feat(ml): export clip models to ONNX and host models on Hugging Face (#4700)
* export clip models
* export to hf
* refactored export code
* export mclip, general refactoring
* cleanup
* updated conda deps
* do transforms with pillow and numpy, add tokenization config to export, general refactoring
* moved conda dockerfile, re-added poetry
* minor fixes
* updated link
* updated tests
* removed `requirements.txt` from workflow
* fixed mimalloc path
* removed torchvision
* cleaner np typing
* review suggestions
* update default model name
* update test
parent 3212a47720
commit 87a0ba3db3

29 changed files with 6192 additions and 2043 deletions
machine-learning/app

@@ -1,5 +1,6 @@
 import json
-from typing import Any, Iterator, TypeAlias
+from pathlib import Path
+from typing import Any, Iterator
 from unittest import mock
 
 import numpy as np
@@ -8,8 +9,7 @@ from fastapi.testclient import TestClient
 from PIL import Image
 
 from .main import app, init_state
-
-ndarray: TypeAlias = np.ndarray[int, np.dtype[np.float32]]
+from .schemas import ndarray_f32
 
 
 @pytest.fixture
@@ -18,13 +18,13 @@ def pil_image() -> Image.Image:
 
 
 @pytest.fixture
-def cv_image(pil_image: Image.Image) -> ndarray:
+def cv_image(pil_image: Image.Image) -> ndarray_f32:
     return np.asarray(pil_image)[:, :, ::-1]  # PIL uses RGB while cv2 uses BGR
 
 
 @pytest.fixture
 def mock_get_model() -> Iterator[mock.Mock]:
-    with mock.patch("app.models.cache.InferenceModel.from_model_type", autospec=True) as mocked:
+    with mock.patch("app.models.cache.from_model_type", autospec=True) as mocked:
         yield mocked
 
 
@@ -37,3 +37,25 @@ def deployed_app() -> TestClient:
 @pytest.fixture(scope="session")
 def responses() -> dict[str, Any]:
     return json.load(open("responses.json", "r"))
+
+
+@pytest.fixture(scope="session")
+def clip_model_cfg() -> dict[str, Any]:
+    return {
+        "embed_dim": 512,
+        "vision_cfg": {"image_size": 224, "layers": 12, "width": 768, "patch_size": 32},
+        "text_cfg": {"context_length": 77, "vocab_size": 49408, "width": 512, "heads": 8, "layers": 12},
+    }
+
+
+@pytest.fixture(scope="session")
+def clip_preprocess_cfg() -> dict[str, Any]:
+    return {
+        "size": [224, 224],
+        "mode": "RGB",
+        "mean": [0.48145466, 0.4578275, 0.40821073],
+        "std": [0.26862954, 0.26130258, 0.27577711],
+        "interpolation": "bicubic",
+        "resize_mode": "shortest",
+        "fill_color": 0,
+    }
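The new clip_preprocess_cfg fixture corresponds to the "do transforms with pillow and numpy" item in the commit message. A rough sketch of how such a config could drive preprocessing, assuming bicubic shortest-side resizing followed by a center crop; the preprocess function below is illustrative, not the app's actual implementation:

# Illustrative Pillow + NumPy preprocessing driven by a clip_preprocess_cfg-style
# dict: resize the shortest side, center-crop, scale to [0, 1], then normalize.
import numpy as np
from PIL import Image

def preprocess(image: Image.Image, cfg: dict) -> np.ndarray:
    size = cfg["size"][0]
    image = image.convert(cfg["mode"])

    # Resize so the shortest side matches the target, preserving aspect ratio.
    scale = size / min(image.size)
    image = image.resize(
        (round(image.width * scale), round(image.height * scale)),
        Image.Resampling.BICUBIC,
    )

    # Center-crop to size x size.
    left = (image.width - size) // 2
    top = (image.height - size) // 2
    image = image.crop((left, top, left + size, top + size))

    # HWC uint8 -> normalized CHW float32, using the fixture's mean/std.
    pixels = np.asarray(image, dtype=np.float32) / 255.0
    pixels = (pixels - np.array(cfg["mean"], dtype=np.float32)) / np.array(cfg["std"], dtype=np.float32)
    return np.expand_dims(pixels.transpose(2, 0, 1), 0)  # add batch dimension

The mean/std values in the fixture are the standard OpenAI CLIP normalization constants, which is why the tests pin them exactly.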
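Since this commit moves model hosting to Hugging Face, fetching an exported model would look roughly like the following; the repo id is an assumed example under the immich-app organization and is not confirmed by this diff:

# Hypothetical download of an exported model from Hugging Face.
# The repo id below is an assumed example, not taken from this commit.
from huggingface_hub import snapshot_download

model_dir = snapshot_download("immich-app/ViT-B-32__openai")
print(model_dir)  # local cache directory containing the exported files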