feat(ml): export clip models to ONNX and host models on Hugging Face (#4700)

* export clip models
* export to hf refactored export code
* export mclip, general refactoring cleanup
* updated conda deps
* do transforms with pillow and numpy, add tokenization config to export, general refactoring
* moved conda dockerfile, re-added poetry
* minor fixes
* updated link
* updated tests
* removed `requirements.txt` from workflow
* fixed mimalloc path
* removed torchvision
* cleaner np typing
* review suggestions
* update default model name
* update test
2025-07-31 09:08:20 +02:00 · 2023-10-31 06:02:04 -04:00 · 2023-10-31 06:02:04 -04:00 · 87a0ba3db3
commit 87a0ba3db3
parent 3212a47720
29 changed files with 6192 additions and 2043 deletions
--- a/machine-learning/locustfile.py
+++ b/machine-learning/locustfile.py
@ -1,11 +1,12 @@
-from io import BytesIO
 import json
+from argparse import ArgumentParser
+from io import BytesIO
 from typing import Any

 from locust import HttpUser, events, task
 from locust.env import Environment
 from PIL import Image
-from argparse import ArgumentParser
+
 byte_image = BytesIO()


@ -14,11 +15,21 @@ def _(parser: ArgumentParser) -> None:
    parser.add_argument("--tag-model", type=str, default="microsoft/resnet-50")
    parser.add_argument("--clip-model", type=str, default="ViT-B-32::openai")
    parser.add_argument("--face-model", type=str, default="buffalo_l")
-    parser.add_argument("--tag-min-score", type=int, default=0.0, 
-                        help="Returns all tags at or above this score. The default returns all tags.")
-    parser.add_argument("--face-min-score", type=int, default=0.034, 
-                        help=("Returns all faces at or above this score. The default returns 1 face per request; "
-                              "setting this to 0 blows up the number of faces to the thousands."))
+    parser.add_argument(
+        "--tag-min-score",
+        type=int,
+        default=0.0,
+        help="Returns all tags at or above this score. The default returns all tags.",
+    )
+    parser.add_argument(
+        "--face-min-score",
+        type=int,
+        default=0.034,
+        help=(
+            "Returns all faces at or above this score. The default returns 1 face per request; "
+            "setting this to 0 blows up the number of faces to the thousands."
+        ),
+    )
    parser.add_argument("--image-size", type=int, default=1000)


@ -62,7 +73,7 @@ class CLIPTextFormDataLoadTest(InferenceLoadTest):
            ("modelName", self.environment.parsed_options.clip_model),
            ("modelType", "clip"),
            ("options", json.dumps({"mode": "text"})),
-            ("text", "test search query")
+            ("text", "test search query"),
        ]
        self.client.post("/predict", data=data)

@ -88,5 +99,5 @@ class RecognitionFormDataLoadTest(InferenceLoadTest):
            ("options", json.dumps({"minScore": self.environment.parsed_options.face_min_score})),
        ]
        files = {"image": self.data}
-            
+
        self.client.post("/predict", data=data, files=files)