
Commit aab2697

feat: add icon and description for Stable Diffusion benchmark (#917)
* Add icon for stable_diffusion task
* Add description for stable_diffusion task
* Sort the order of task based on BenchmarkId.allIds
* Fix ios-build-test.yml
* Fix ios-build-test.yml
1 parent f74b27c commit aab2697

9 files changed (+47, -11 lines)
.github/workflows/ios-build-test.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -10,7 +10,7 @@ jobs:
   build:
     name: Build and test iOS app
     # https://github.com/actions/runner-images/blob/main/images/macos/macos-12-Readme.md
-    runs-on: macos-12
+    runs-on: macos-13
     timeout-minutes: 120
     env:
       PERF_TEST: true
```
2 files: 12 additions & 0 deletions each (diff not loaded; presumably the new ic_task_stable_diffusion SVG assets referenced in flutter/lib/ui/icons.dart)

flutter/integration_test/expected_throughput.dart

Lines changed: 1 addition & 1 deletion
```diff
@@ -26,7 +26,7 @@ const _kS22Ultra = 'SM-S908U1'; // Galaxy S22 Ultra
 const _kDN2103 = 'DN2103'; // OnePlus DN2103

 // iOS
-const _kIphoneOnGitHubAction = 'iPhone15,3';
+const _kIphoneOnGitHubAction = 'iPhone16,2';
 const _kIphoneOnMacbookM1 = 'iPhone14,7';

 const Map<String, Map<String, Interval>> _imageClassificationV2 = {
```
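The new model id keys the per-device expected-throughput tables used by the integration test. A self-contained sketch of checking a measured value against such a table; the `Interval` class here is a stand-in for the app's own, and the bounds and measured value are illustrative, not the app's actual thresholds:

```dart
// Stand-in for the app's Interval class; bounds below are made up.
class Interval {
  final double min, max;
  const Interval({required this.min, required this.max});
  bool contains(double v) => v >= min && v <= max;
}

const kIphoneOnGitHubAction = 'iPhone16,2';

// Hypothetical expected-throughput table keyed by device model id.
const expected = {
  kIphoneOnGitHubAction: Interval(min: 1.0, max: 100.0),
};

void main() {
  const measured = 42.0; // hypothetical measured throughput
  final ok = expected[kIphoneOnGitHubAction]?.contains(measured) ?? false;
  print('within expected range: $ok');
}
```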

flutter/lib/app_constants.dart

Lines changed: 3 additions & 2 deletions
```diff
@@ -24,14 +24,15 @@ class BenchmarkId {
   static const imageClassificationOfflineV2 = 'image_classification_offline_v2';
   static const stableDiffusion = 'stable_diffusion';

+  // The sort order of this list will be used in the UI
   static const allIds = [
+    imageClassificationV2,
     objectDetection,
     imageSegmentationV2,
     naturalLanguageProcessing,
     superResolution,
-    imageClassificationV2,
-    imageClassificationOfflineV2,
     stableDiffusion,
+    imageClassificationOfflineV2,
   ];
 }
```

flutter/lib/benchmark/benchmark.dart

Lines changed: 5 additions & 1 deletion
```diff
@@ -130,7 +130,11 @@ class BenchmarkStore {
     required List<pb.BenchmarkSetting> backendConfig,
     required Map<String, bool> taskSelection,
   }) {
-    for (final task in appConfig.task) {
+    // sort the order of task based on BenchmarkId.allIds
+    final List<pb.TaskConfig> sortedTasks = List.from(appConfig.task)
+      ..sort((a, b) =>
+          BenchmarkId.allIds.indexOf(a.id) - BenchmarkId.allIds.indexOf(b.id));
+    for (final task in sortedTasks) {
       final backendSettings = backendConfig
           .singleWhereOrNull((setting) => setting.benchmarkId == task.id);
       if (backendSettings == null) {
```
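Since `List.indexOf` returns -1 for ids missing from `BenchmarkId.allIds`, any task not listed there would sort to the front. A minimal runnable sketch of the comparator's behavior, with plain id strings standing in for `pb.TaskConfig`:

```dart
// Ordering used above, demonstrated on plain id strings.
const allIds = ['image_classification_v2', 'object_detection', 'stable_diffusion'];

void main() {
  final tasks = ['stable_diffusion', 'object_detection', 'image_classification_v2'];
  tasks.sort((a, b) => allIds.indexOf(a) - allIds.indexOf(b));
  print(tasks); // [image_classification_v2, object_detection, stable_diffusion]
}
```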

flutter/lib/benchmark/info.dart

Lines changed: 6 additions & 0 deletions
```diff
@@ -63,6 +63,12 @@ class BenchmarkInfo {
         detailsTitle: stringResources.benchInfoSuperResolution,
         detailsContent: stringResources.benchInfoSuperResolutionDesc,
       );
+    case (BenchmarkId.stableDiffusion):
+      return BenchmarkLocalizationInfo(
+        name: stringResources.benchNameStableDiffusion,
+        detailsTitle: stringResources.benchInfoStableDiffusion,
+        detailsContent: stringResources.benchInfoStableDiffusionDesc,
+      );
     default:
       throw 'unhandled task id: ${task.id}';
   }
```

flutter/lib/l10n/app_en.arb

Lines changed: 3 additions & 0 deletions
```diff
@@ -102,17 +102,20 @@
   "benchNameLanguageProcessing": "Language Processing",
   "benchNameImageClassificationOffline": "Image Classification (offline)",
   "benchNameSuperResolution": "Super Resolution",
+  "benchNameStableDiffusion": "Stable Diffusion",
   "benchInfoImageClassification": "Image Classification",
   "benchInfoObjectDetection": "Object detection",
   "benchInfoImageSegmentation": "Image Segmentation",
   "benchInfoLanguageProcessing": "Language Processing",
   "benchInfoSuperResolution": "Super Resolution",
+  "benchInfoStableDiffusion": "Stable Diffusion",
   "benchInfoImageClassificationDesc": "Image classification picks the best label to describe an input image and is commonly used for photo search and text extraction. The MobileNetEdgeTPU reference model is evaluated on the ImageNet 2012 validation dataset and requires a minimum accuracy of 74.66% (98% of FP32 accuracy of 76.19%) Top-1 accuracy (For Performance measurements, App uses a different dataset).\n\nThe MobileNetEdgeTPU network is a descendent of the MobileNet-v2 family that is optimized for low-latency and mobile accelerators. The MobileNetEdgeTPU model architecture is based on convolutional layers with inverted residuals and linear bottlenecks, similar to MobileNet v2, but is optimized by introducing fused inverted bottleneck convolutions to improve hardware utilization, and removing hard-swish and squeeze-and-excite blocks.\n\nThe offline variant of image classification has no latency constraints and typically uses batched inference and has higher throughput.",
   "benchInfoImageClassificationV2Desc": "Image classification picks the best label to describe an input image and is commonly used for photo search and text extraction.\n\nThe MobileNetV4-Conv-L model boasts an impressive 83% accuracy with the ImageNet dataset, versus 76% accuracy for the prior standard, MobileNetEdgeTPU. MobileNetV4-Conv-L is designed to perform well across a range of mobile processor types, from CPUs and GPUs to neural accelerators. The MLPerf Mobile working group worked closely with the MobileNetV4 team in order to ensure optimized performance. This combination of an improved model architecture and collaborative optimization has proven quite potent. Although MobileNetV4-Conv-L executes six times the number of mathematical operations of its predecessor, MobileNetEdgeTPU, benchmark execution times have only increased by a factor of roughly 4.6.\n\nThe offline variant of image classification has no latency constraints and typically uses batched inference and has higher throughput.",
   "benchInfoObjectDetectionDesc": "Object detection draws bounding boxes around recognized objects in an input image, assigning each one a label. This is a common approach for identifying objects in photos, and automotive safety. Since v1.0, our reference model has been updated to MobileDets (from v0.7 model, Single Shot Detector with a MobileNet-v2 feature extractor operating). MobileDets are trained on the COCO 2017 validation dataset. The MobileDets Object Detection task is evaluated on the COCO 2017 dataset with an input image resolution of 320x320. It requires a minimum mean Average Precision (mAP) of 27.075 (95% of FP32 mAP of 28.5%), which is significantly higher than that of the previous model.\n\nMobileDets are searched for object detection. A key feature of MobileDets is that the search space includes both inverted bottleneck blocks and regular convolution operations to help improve the accuracy-latency trade-off on several hardware accelerators.",
   "benchInfoImageSegmentationDesc": "Semantic image segmentation partitions an input image into labeled objects at pixel granularity, and is used for complex image manipulation such as red-eye reduction as well as automotive and medical applications. The reference model is the MOSAIC network paired with a tailored feature extraction backbone. It operates on 512x512 resolution input images from the ADE20K validation set and requires a minimum mean Intersection Over Union (mIoU) value of 57.36% (96% of FP32 mIoU of 59.75%), significantly higher than the previous segmentation model (MobileNetv2-Deeplabv3+).\n\nMOSAIC employs a simple asymmetric encoder-decoder structure which consists of an efficient multi-scale context encoder and a light-weight hybrid decoder to recover spatial details from aggregated information with multiple lateral connections between the two. The feature extractor is a variant of MobileNet Multi-Hardware, which is a network built and optimized with neural architecture search. It is further enhanced for image segmentation by reducing the output stride, adding dilated convolutions at the end stage, and halving the feature channels.",
   "benchInfoLanguageProcessingDesc": "Question Answering finds the best answer to an input question based on a body of text, and is commonly employed in applications such as virtual assistants and chatbots. The reference model, MobileBERT, is evaluated on the Stanford Question Answering Dataset (SQUAD) v1.1 Dev-mini. The task requires a minimum F1-score of 87.4% (93% of FP32 F1-score of 93.08%).\n\nMobileBERT is a streamlined, mobile-optimized version of the larger BERT_LARGE network. It features bottleneck structures and a carefully designed balance between self-attention and feed-forward networks. While BERT is task-agnostic and can be applied to various downstream natural language processing tasks, the MobileBERT variant used in MLPerf is specifically fine-tuned for question answering.",
   "benchInfoSuperResolutionDesc": "Image Super Resolution (SR) upscales a lower resolution input into a higher resolution output image, enhancing the quality and detail. It is a common task in many mobile applications such as digital zoom. The reference model, EDSR F32B5, is a lightweight member of the Enhanced Deep Super Resolution (EDSR) family that is trained for 2X super resolution on the DIV2K dataset with bicubic downsampling and tested on the OpenSR test-set which comprises 25 selected 1920x1080 HDR images. The benchmark requires a minimum accuracy of 33 dB Peak Signal to Noise Ratio (PSNR) relative to a 33.58 dB accuracy with FP32.\n\nThe EDSR family of models demonstrated excellent performance by winning a super resolution challenge at CVPR 2017. The EDSR F32B5 reference model features five EDSR blocks, each with 32 feature maps. The EDSR block is a simple residual block consisting of a residual connection on one branch and a convolution-ReLU-convolution on the other branch. The final upsampling layer is a depth-to-space operator, which facilitates the x2 super resolution process.",
+  "benchInfoStableDiffusionDesc": "The Text to Image Gen AI benchmark adopts Stable Diffusion v1.5 for generating images from text prompts. It is a latent diffusion model. The benchmarked Stable Diffusion v1.5 refers to a specific configuration of the model architecture that uses a downsampling-factor 8 autoencoder with an 860M UNet,123M CLIP ViT-L/14 text encoder for the diffusion model, and VAE Decoder of 49.5M parameters. The model was trained on 595k steps at resolution of 512x512, which enables it to generate high quality images. We refer you to https://huggingface.co/benjamin-paine/stable-diffusion-v1-5 for more information. The benchmark runs 20 denoising steps for inference, and uses a precalculated time embedding of size 1x1280. Reference models can be found here https://github.com/mlcommons/mobile_open/releases.\n\nFor latency benchmarking, we benchmark end to end, excluding the time embedding calculation and the tokenizer. For accuracy calculations, the app adopts the CLIP metric for text-to-image consistency, and further evaluation of the generated images using this Image Quality Aesthetic Assessment metric https://github.com/idealo/image-quality-assessment/tree/master?tab=readme-ov-file",

   "resourceErrorMessage": "Some resources failed to load.\nIf you didn't change config from default you can try clearing the cache.\nIf you use a custom configuration file ensure that it has correct structure or switch back to default config.",
   "resourceErrorSelectTaskFile": "Update task configuration",
```

flutter/lib/ui/icons.dart

Lines changed: 4 additions & 6 deletions
```diff
@@ -26,10 +26,8 @@ class AppIcons {
       _pSvg('ic_task_image_classification_offline.svg');
   static final SvgPicture superResolution =
       _pSvg('ic_task_super_resolution.svg');
-
-  // TODO (anhappdev): update icon
   static final SvgPicture stableDiffusion =
-      _pSvg('ic_task_super_resolution.svg');
+      _pSvg('ic_task_stable_diffusion.svg');

   static final SvgPicture imageClassificationWhite =
       _pSvg('ic_task_image_classification_white.svg');
@@ -43,10 +41,8 @@ class AppIcons {
       _pSvg('ic_task_image_classification_offline_white.svg');
   static final SvgPicture superResolutionWhite =
       _pSvg('ic_task_super_resolution_white.svg');
-
-  // TODO (anhappdev): update icon
   static final SvgPicture stableDiffusionWhite =
-      _pSvg('ic_task_super_resolution_white.svg');
+      _pSvg('ic_task_stable_diffusion_white.svg');

   static final SvgPicture arrow = _pSvg('ic_arrow.svg');

@@ -71,6 +67,7 @@ class BenchmarkIcons {
     BenchmarkId.imageSegmentationV2: AppIcons.imageSegmentation,
     BenchmarkId.naturalLanguageProcessing: AppIcons.languageProcessing,
     BenchmarkId.superResolution: AppIcons.superResolution,
+    BenchmarkId.stableDiffusion: AppIcons.stableDiffusion,
     BenchmarkId.imageClassificationOfflineV2:
         AppIcons.imageClassificationOffline,
   };
@@ -81,6 +78,7 @@ class BenchmarkIcons {
     BenchmarkId.imageSegmentationV2: AppIcons.imageSegmentationWhite,
     BenchmarkId.naturalLanguageProcessing: AppIcons.languageProcessingWhite,
     BenchmarkId.superResolution: AppIcons.superResolutionWhite,
+    BenchmarkId.stableDiffusion: AppIcons.stableDiffusionWhite,
     BenchmarkId.imageClassificationOfflineV2:
         AppIcons.imageClassificationOfflineWhite,
   };
```
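A minimal sketch of how such an id-to-icon map resolves at a call site; the asset names mirror the diff above, but `iconFor` and the fallback asset are illustrative, and Dart map lookup yields null for ids without an entry:

```dart
// Illustrative id -> asset-name map with a null-aware fallback.
const iconSet = {
  'stable_diffusion': 'ic_task_stable_diffusion.svg',
  'super_resolution': 'ic_task_super_resolution.svg',
};

String iconFor(String benchmarkId) =>
    iconSet[benchmarkId] ?? 'ic_task_unknown.svg'; // hypothetical fallback

void main() {
  print(iconFor('stable_diffusion')); // ic_task_stable_diffusion.svg
  print(iconFor('no_such_task')); // ic_task_unknown.svg
}
```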
