diff --git a/machine-learning/app/config.py b/machine-learning/app/config.py
index ab27499a25..9b98eecded 100644
--- a/machine-learning/app/config.py
+++ b/machine-learning/app/config.py
@@ -12,8 +12,6 @@ from rich.logging import RichHandler
 from uvicorn import Server
 from uvicorn.workers import UvicornWorker
 
-from .schemas import ModelType
-
 
 class PreloadModelData(BaseModel):
     clip: str | None
@@ -21,7 +19,7 @@ class PreloadModelData(BaseModel):
 
 
 class Settings(BaseSettings):
-    cache_folder: str = "/cache"
+    cache_folder: Path = Path("/cache")
     model_ttl: int = 300
     model_ttl_poll_s: int = 10
     host: str = "0.0.0.0"
@@ -55,14 +53,6 @@ def clean_name(model_name: str) -> str:
     return model_name.split("/")[-1].translate(_clean_name)
 
 
-def get_cache_dir(model_name: str, model_type: ModelType) -> Path:
-    return Path(settings.cache_folder) / model_type.value / clean_name(model_name)
-
-
-def get_hf_model_name(model_name: str) -> str:
-    return f"immich-app/{clean_name(model_name)}"
-
-
 LOG_LEVELS: dict[str, int] = {
     "critical": logging.ERROR,
     "error": logging.ERROR,
diff --git a/machine-learning/app/main.py b/machine-learning/app/main.py
index 277ad76898..3c607015d9 100644
--- a/machine-learning/app/main.py
+++ b/machine-learning/app/main.py
@@ -6,22 +6,34 @@ import threading
 import time
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import asynccontextmanager
+from functools import partial
 from typing import Any, AsyncGenerator, Callable, Iterator
 from zipfile import BadZipFile
 
 import orjson
-from fastapi import Depends, FastAPI, Form, HTTPException, UploadFile
+from fastapi import Depends, FastAPI, File, Form, HTTPException
 from fastapi.responses import ORJSONResponse
 from onnxruntime.capi.onnxruntime_pybind11_state import InvalidProtobuf, NoSuchFile
+from PIL.Image import Image
+from pydantic import ValidationError
 from starlette.formparsers import MultiPartParser
 
+from app.models import get_model_deps
 from app.models.base import InferenceModel
+from app.models.transforms import decode_pil
 
 from .config import PreloadModelData, log, settings
 from .models.cache import ModelCache
 from .schemas import (
+    InferenceEntries,
+    InferenceEntry,
+    InferenceResponse,
     MessageResponse,
+    ModelIdentity,
+    ModelTask,
     ModelType,
+    PipelineRequest,
+    T,
     TextResponse,
 )
 
@@ -63,12 +75,21 @@ async def lifespan(_: FastAPI) -> AsyncGenerator[None, None]:
         gc.collect()
 
 
-async def preload_models(preload_models: PreloadModelData) -> None:
-    log.info(f"Preloading models: {preload_models}")
-    if preload_models.clip is not None:
-        await load(await model_cache.get(preload_models.clip, ModelType.CLIP))
-    if preload_models.facial_recognition is not None:
-        await load(await model_cache.get(preload_models.facial_recognition, ModelType.FACIAL_RECOGNITION))
+async def preload_models(preload: PreloadModelData) -> None:
+    log.info(f"Preloading models: {preload}")
+    if preload.clip is not None:
+        model = await model_cache.get(preload.clip, ModelType.TEXTUAL, ModelTask.SEARCH)
+        await load(model)
+
+        model = await model_cache.get(preload.clip, ModelType.VISUAL, ModelTask.SEARCH)
+        await load(model)
+
+    if preload.facial_recognition is not None:
+        model = await model_cache.get(preload.facial_recognition, ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION)
+        await load(model)
+
+        model = await model_cache.get(preload.facial_recognition, ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION)
+        await load(model)
 
 
 def update_state() -> Iterator[None]:
@@ -81,6 +102,27 @@ def update_state() -> Iterator[None]:
         active_requests -= 1
 
 
+def get_entries(entries: str = Form()) -> InferenceEntries:
+    try:
+        request: PipelineRequest = orjson.loads(entries)
+        without_deps: list[InferenceEntry] = []
+        with_deps: list[InferenceEntry] = []
+        for task, types in request.items():
+            for type, entry in types.items():
+                parsed: InferenceEntry = {
+                    "name": entry["modelName"],
+                    "task": task,
+                    "type": type,
+                    "options": entry.get("options", {}),
+                }
+                dep = get_model_deps(parsed["name"], type, task)
+                (with_deps if dep else without_deps).append(parsed)
+        return without_deps, with_deps
+    except (orjson.JSONDecodeError, ValidationError, KeyError, AttributeError) as e:
+        log.error(f"Invalid request format: {e}")
+        raise HTTPException(422, "Invalid request format.")
+
+
 app = FastAPI(lifespan=lifespan)
 
 
@@ -96,42 +138,63 @@ def ping() -> str:
 
 @app.post("/predict", dependencies=[Depends(update_state)])
 async def predict(
-    model_name: str = Form(alias="modelName"),
-    model_type: ModelType = Form(alias="modelType"),
-    options: str = Form(default="{}"),
+    entries: InferenceEntries = Depends(get_entries),
+    image: bytes | None = File(default=None),
     text: str | None = Form(default=None),
-    image: UploadFile | None = None,
 ) -> Any:
     if image is not None:
-        inputs: str | bytes = await image.read()
+        inputs: Image | str = await run(lambda: decode_pil(image))
     elif text is not None:
         inputs = text
     else:
         raise HTTPException(400, "Either image or text must be provided")
-    try:
-        kwargs = orjson.loads(options)
-    except orjson.JSONDecodeError:
-        raise HTTPException(400, f"Invalid options JSON: {options}")
-
-    model = await load(await model_cache.get(model_name, model_type, ttl=settings.model_ttl, **kwargs))
-    model.configure(**kwargs)
-    outputs = await run(model.predict, inputs)
-    return ORJSONResponse(outputs)
+    response = await run_inference(inputs, entries)
+    return ORJSONResponse(response)
 
 
-async def run(func: Callable[..., Any], inputs: Any) -> Any:
+async def run_inference(payload: Image | str, entries: InferenceEntries) -> InferenceResponse:
+    outputs: dict[ModelIdentity, Any] = {}
+    response: InferenceResponse = {}
+
+    async def _run_inference(entry: InferenceEntry) -> None:
+        model = await model_cache.get(entry["name"], entry["type"], entry["task"], ttl=settings.model_ttl)
+        inputs = [payload]
+        for dep in model.depends:
+            try:
+                inputs.append(outputs[dep])
+            except KeyError:
+                message = f"Task {entry['task']} of type {entry['type']} depends on output of {dep}"
+                raise HTTPException(400, message)
+        model = await load(model)
+        output = await run(model.predict, *inputs, **entry["options"])
+        outputs[model.identity] = output
+        response[entry["task"]] = output
+
+    without_deps, with_deps = entries
+    await asyncio.gather(*[_run_inference(entry) for entry in without_deps])
+    if with_deps:
+        await asyncio.gather(*[_run_inference(entry) for entry in with_deps])
+    if isinstance(payload, Image):
+        response["imageHeight"], response["imageWidth"] = payload.height, payload.width
+
+    return response
+
+
+async def run(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
     if thread_pool is None:
-        return func(inputs)
-    return await asyncio.get_running_loop().run_in_executor(thread_pool, func, inputs)
+        return func(*args, **kwargs)
+    partial_func = partial(func, *args, **kwargs)
+    return await asyncio.get_running_loop().run_in_executor(thread_pool, partial_func)
 
 
 async def load(model: InferenceModel) -> InferenceModel:
     if model.loaded:
         return model
 
-    def _load(model: InferenceModel) -> None:
+    def _load(model: InferenceModel) -> InferenceModel:
         with lock:
             model.load()
+        return model
 
     try:
         await run(_load, model)
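Note: a minimal sketch of the multipart payload the reworked /predict endpoint now parses in get_entries() above — the JSON is keyed by task, then type, with {"modelName", "options"} per entry. The model names and the client call are illustrative, not part of this diff.

    import json

    entries = {
        # ModelTask.SEARCH serializes to "clip", ModelTask.FACIAL_RECOGNITION to "facial-recognition"
        "clip": {
            "visual": {"modelName": "ViT-B-32__openai"},
        },
        "facial-recognition": {
            "detection": {"modelName": "buffalo_l", "options": {"minScore": 0.7}},
            "recognition": {"modelName": "buffalo_l"},
        },
    }

    # Hypothetical client call: the endpoint reads "entries" from the form body and
    # "image" (or "text") alongside it.
    # requests.post(url, data={"entries": json.dumps(entries)}, files={"image": image_bytes})
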
diff --git a/machine-learning/app/models/__init__.py b/machine-learning/app/models/__init__.py
index 18d75e1632..25e726c64e 100644
--- a/machine-learning/app/models/__init__.py
+++ b/machine-learning/app/models/__init__.py
@@ -1,24 +1,40 @@
 from typing import Any
 
-from app.schemas import ModelType
+from app.models.base import InferenceModel
+from app.models.clip.textual import MClipTextualEncoder, OpenClipTextualEncoder
+from app.models.clip.visual import OpenClipVisualEncoder
+from app.schemas import ModelSource, ModelTask, ModelType
 
-from .base import InferenceModel
-from .clip import MCLIPEncoder, OpenCLIPEncoder
-from .constants import is_insightface, is_mclip, is_openclip
-from .facial_recognition import FaceRecognizer
+from .constants import get_model_source
+from .facial_recognition.detection import FaceDetector
+from .facial_recognition.recognition import FaceRecognizer
 
 
-def from_model_type(model_type: ModelType, model_name: str, **model_kwargs: Any) -> InferenceModel:
-    match model_type:
-        case ModelType.CLIP:
-            if is_openclip(model_name):
-                return OpenCLIPEncoder(model_name, **model_kwargs)
-            elif is_mclip(model_name):
-                return MCLIPEncoder(model_name, **model_kwargs)
-        case ModelType.FACIAL_RECOGNITION:
-            if is_insightface(model_name):
-                return FaceRecognizer(model_name, **model_kwargs)
+def get_model_class(model_name: str, model_type: ModelType, model_task: ModelTask) -> type[InferenceModel]:
+    source = get_model_source(model_name)
+    match source, model_type, model_task:
+        case ModelSource.OPENCLIP | ModelSource.MCLIP, ModelType.VISUAL, ModelTask.SEARCH:
+            return OpenClipVisualEncoder
+
+        case ModelSource.OPENCLIP, ModelType.TEXTUAL, ModelTask.SEARCH:
+            return OpenClipTextualEncoder
+
+        case ModelSource.MCLIP, ModelType.TEXTUAL, ModelTask.SEARCH:
+            return MClipTextualEncoder
+
+        case ModelSource.INSIGHTFACE, ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION:
+            return FaceDetector
+
+        case ModelSource.INSIGHTFACE, ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION:
+            return FaceRecognizer
+
         case _:
-            raise ValueError(f"Unknown model type {model_type}")
+            raise ValueError(f"Unknown model combination: {source}, {model_type}, {model_task}")
 
-    raise ValueError(f"Unknown {model_type} model {model_name}")
+
+def from_model_type(model_name: str, model_type: ModelType, model_task: ModelTask, **kwargs: Any) -> InferenceModel:
+    return get_model_class(model_name, model_type, model_task)(model_name, **kwargs)
+
+
+def get_model_deps(model_name: str, model_type: ModelType, model_task: ModelTask) -> list[tuple[ModelType, ModelTask]]:
+    return get_model_class(model_name, model_type, model_task).depends
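Note: a short usage sketch of the new dispatch helpers above (a sketch only; "buffalo_l" is assumed to be listed in _INSIGHTFACE_MODELS).

    from app.models import from_model_type, get_model_deps
    from app.schemas import ModelTask, ModelType

    # Resolves to FaceRecognizer via (INSIGHTFACE, RECOGNITION, FACIAL_RECOGNITION)
    model = from_model_type("buffalo_l", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION)

    # The recognizer declares that it needs the detector's output first
    assert get_model_deps("buffalo_l", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION) == [
        (ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION)
    ]
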
diff --git a/machine-learning/app/models/base.py b/machine-learning/app/models/base.py
index 16adc159ad..f64a873010 100644
--- a/machine-learning/app/models/base.py
+++ b/machine-learning/app/models/base.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from pathlib import Path
 from shutil import rmtree
-from typing import Any
+from typing import Any, ClassVar
 
 import onnxruntime as ort
 from huggingface_hub import snapshot_download
@@ -11,13 +11,14 @@ from huggingface_hub import snapshot_download
 import ann.ann
 from app.models.constants import SUPPORTED_PROVIDERS
 
-from ..config import get_cache_dir, get_hf_model_name, log, settings
-from ..schemas import ModelRuntime, ModelType
+from ..config import clean_name, log, settings
+from ..schemas import ModelFormat, ModelIdentity, ModelSession, ModelTask, ModelType
 from .ann import AnnSession
 
 
 class InferenceModel(ABC):
-    _model_type: ModelType
+    depends: ClassVar[list[ModelIdentity]]
+    identity: ClassVar[ModelIdentity]
 
     def __init__(
         self,
@@ -26,16 +27,16 @@ class InferenceModel(ABC):
         providers: list[str] | None = None,
         provider_options: list[dict[str, Any]] | None = None,
         sess_options: ort.SessionOptions | None = None,
-        preferred_runtime: ModelRuntime | None = None,
+        preferred_format: ModelFormat | None = None,
         **model_kwargs: Any,
     ) -> None:
         self.loaded = False
-        self.model_name = model_name
+        self.model_name = clean_name(model_name)
         self.cache_dir = Path(cache_dir) if cache_dir is not None else self.cache_dir_default
         self.providers = providers if providers is not None else self.providers_default
         self.provider_options = provider_options if provider_options is not None else self.provider_options_default
         self.sess_options = sess_options if sess_options is not None else self.sess_options_default
-        self.preferred_runtime = preferred_runtime if preferred_runtime is not None else self.preferred_runtime_default
+        self.preferred_format = preferred_format if preferred_format is not None else self.preferred_format_default
 
     def download(self) -> None:
         if not self.cached:
@@ -47,35 +48,36 @@ class InferenceModel(ABC):
     def load(self) -> None:
         if self.loaded:
             return
+
         self.download()
         log.info(f"Loading {self.model_type.replace('-', ' ')} model '{self.model_name}' to memory")
-        self._load()
+        self.session = self._load()
         self.loaded = True
 
-    def predict(self, inputs: Any, **model_kwargs: Any) -> Any:
+    def predict(self, *inputs: Any, **model_kwargs: Any) -> Any:
         self.load()
         if model_kwargs:
             self.configure(**model_kwargs)
-        return self._predict(inputs)
+        return self._predict(*inputs, **model_kwargs)
 
     @abstractmethod
-    def _predict(self, inputs: Any) -> Any: ...
+    def _predict(self, *inputs: Any, **model_kwargs: Any) -> Any: ...
 
-    def configure(self, **model_kwargs: Any) -> None:
+    def configure(self, **kwargs: Any) -> None:
         pass
 
     def _download(self) -> None:
-        ignore_patterns = [] if self.preferred_runtime == ModelRuntime.ARMNN else ["*.armnn"]
+        ignore_patterns = [] if self.preferred_format == ModelFormat.ARMNN else ["*.armnn"]
         snapshot_download(
-            get_hf_model_name(self.model_name),
+            f"immich-app/{clean_name(self.model_name)}",
             cache_dir=self.cache_dir,
             local_dir=self.cache_dir,
             local_dir_use_symlinks=False,
             ignore_patterns=ignore_patterns,
         )
 
-    @abstractmethod
-    def _load(self) -> None: ...
+    def _load(self) -> ModelSession:
+        return self._make_session(self.model_path)
 
     def clear_cache(self) -> None:
         if not self.cache_dir.exists():
@@ -99,7 +101,7 @@ class InferenceModel(ABC):
             self.cache_dir.unlink()
         self.cache_dir.mkdir(parents=True, exist_ok=True)
 
-    def _make_session(self, model_path: Path) -> AnnSession | ort.InferenceSession:
+    def _make_session(self, model_path: Path) -> ModelSession:
         if not model_path.is_file():
             onnx_path = model_path.with_suffix(".onnx")
             if not onnx_path.is_file():
@@ -124,9 +126,21 @@ class InferenceModel(ABC):
                 raise ValueError(f"Unsupported model file type: {model_path.suffix}")
         return session
 
+    @property
+    def model_dir(self) -> Path:
+        return self.cache_dir / self.model_type.value
+
+    @property
+    def model_path(self) -> Path:
+        return self.model_dir / f"model.{self.preferred_format}"
+
+    @property
+    def model_task(self) -> ModelTask:
+        return self.identity[1]
+
     @property
     def model_type(self) -> ModelType:
-        return self._model_type
+        return self.identity[0]
 
     @property
     def cache_dir(self) -> Path:
@@ -138,11 +152,11 @@ class InferenceModel(ABC):
 
     @property
     def cache_dir_default(self) -> Path:
-        return get_cache_dir(self.model_name, self.model_type)
+        return settings.cache_folder / self.model_task.value / self.model_name
 
     @property
     def cached(self) -> bool:
-        return self.cache_dir.is_dir() and any(self.cache_dir.iterdir())
+        return self.model_path.is_file()
 
     @property
     def providers(self) -> list[str]:
@@ -226,14 +240,14 @@ class InferenceModel(ABC):
         return sess_options
 
     @property
-    def preferred_runtime(self) -> ModelRuntime:
-        return self._preferred_runtime
+    def preferred_format(self) -> ModelFormat:
+        return self._preferred_format
 
-    @preferred_runtime.setter
-    def preferred_runtime(self, preferred_runtime: ModelRuntime) -> None:
-        log.debug(f"Setting preferred runtime to {preferred_runtime}")
-        self._preferred_runtime = preferred_runtime
+    @preferred_format.setter
+    def preferred_format(self, preferred_format: ModelFormat) -> None:
+        log.debug(f"Setting preferred format to {preferred_format}")
+        self._preferred_format = preferred_format
 
     @property
-    def preferred_runtime_default(self) -> ModelRuntime:
-        return ModelRuntime.ARMNN if ann.ann.is_available and settings.ann else ModelRuntime.ONNX
+    def preferred_format_default(self) -> ModelFormat:
+        return ModelFormat.ARMNN if ann.ann.is_available and settings.ann else ModelFormat.ONNX
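Note: with the cache_dir_default, model_dir and model_path properties above, on-disk paths are now composed as cache_folder / task / model_name / model_type / model.<format>. A small sketch of the resulting layout, assuming the default /cache folder and ONNX format:

    from pathlib import Path

    cache_folder = Path("/cache")  # settings.cache_folder default
    task, name, model_type, fmt = "clip", "ViT-B-32__openai", "visual", "onnx"

    model_path = cache_folder / task / name / model_type / f"model.{fmt}"
    print(model_path)  # /cache/clip/ViT-B-32__openai/visual/model.onnx
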
diff --git a/machine-learning/app/models/cache.py b/machine-learning/app/models/cache.py
index 781a9caea0..bf8e8a6352 100644
--- a/machine-learning/app/models/cache.py
+++ b/machine-learning/app/models/cache.py
@@ -5,9 +5,9 @@ from aiocache.lock import OptimisticLock
 from aiocache.plugins import TimingPlugin
 
 from app.models import from_model_type
+from app.models.base import InferenceModel
 
-from ..schemas import ModelType, has_profiling
-from .base import InferenceModel
+from ..schemas import ModelTask, ModelType, has_profiling
 
 
 class ModelCache:
@@ -31,28 +31,21 @@ class ModelCache:
         if profiling:
             plugins.append(TimingPlugin())
 
-        self.revalidate_enable = revalidate
+        self.should_revalidate = revalidate
 
         self.cache = SimpleMemoryCache(timeout=timeout, plugins=plugins, namespace=None)
 
-    async def get(self, model_name: str, model_type: ModelType, **model_kwargs: Any) -> InferenceModel:
-        """
-        Args:
-            model_name: Name of model in the model hub used for the task.
-            model_type: Model type or task, which determines which model zoo is used.
-
-        Returns:
-            model: The requested model.
-        """
-
-        key = f"{model_name}{model_type.value}{model_kwargs.get('mode', '')}"
+    async def get(
+        self, model_name: str, model_type: ModelType, model_task: ModelTask, **model_kwargs: Any
+    ) -> InferenceModel:
+        key = f"{model_name}{model_type}{model_task}"
 
         async with OptimisticLock(self.cache, key) as lock:
             model: InferenceModel | None = await self.cache.get(key)
             if model is None:
-                model = from_model_type(model_type, model_name, **model_kwargs)
+                model = from_model_type(model_name, model_type, model_task, **model_kwargs)
                 await lock.cas(model, ttl=model_kwargs.get("ttl", None))
-            elif self.revalidate_enable:
+            elif self.should_revalidate:
                 await self.revalidate(key, model_kwargs.get("ttl", None))
         return model
 
diff --git a/machine-learning/app/models/clip.py b/machine-learning/app/models/clip.py
deleted file mode 100644
index b0d73175a9..0000000000
--- a/machine-learning/app/models/clip.py
+++ /dev/null
@@ -1,189 +0,0 @@
-import json
-from abc import abstractmethod
-from functools import cached_property
-from io import BytesIO
-from pathlib import Path
-from typing import Any, Literal
-
-import numpy as np
-from numpy.typing import NDArray
-from PIL import Image
-from tokenizers import Encoding, Tokenizer
-
-from app.config import clean_name, log
-from app.models.transforms import crop, get_pil_resampling, normalize, resize, to_numpy
-from app.schemas import ModelType
-
-from .base import InferenceModel
-
-
-class BaseCLIPEncoder(InferenceModel):
-    _model_type = ModelType.CLIP
-
-    def __init__(
-        self,
-        model_name: str,
-        cache_dir: Path | str | None = None,
-        mode: Literal["text", "vision"] | None = None,
-        **model_kwargs: Any,
-    ) -> None:
-        self.mode = mode
-        super().__init__(model_name, cache_dir, **model_kwargs)
-
-    def _load(self) -> None:
-        if self.mode == "text" or self.mode is None:
-            log.debug(f"Loading clip text model '{self.model_name}'")
-            self.text_model = self._make_session(self.textual_path)
-            log.debug(f"Loaded clip text model '{self.model_name}'")
-
-        if self.mode == "vision" or self.mode is None:
-            log.debug(f"Loading clip vision model '{self.model_name}'")
-            self.vision_model = self._make_session(self.visual_path)
-            log.debug(f"Loaded clip vision model '{self.model_name}'")
-
-    def _predict(self, image_or_text: Image.Image | str) -> NDArray[np.float32]:
-        if isinstance(image_or_text, bytes):
-            image_or_text = Image.open(BytesIO(image_or_text))
-
-        match image_or_text:
-            case Image.Image():
-                if self.mode == "text":
-                    raise TypeError("Cannot encode image as text-only model")
-                outputs: NDArray[np.float32] = self.vision_model.run(None, self.transform(image_or_text))[0][0]
-            case str():
-                if self.mode == "vision":
-                    raise TypeError("Cannot encode text as vision-only model")
-                outputs = self.text_model.run(None, self.tokenize(image_or_text))[0][0]
-            case _:
-                raise TypeError(f"Expected Image or str, but got: {type(image_or_text)}")
-
-        return outputs
-
-    @abstractmethod
-    def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
-        pass
-
-    @abstractmethod
-    def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]:
-        pass
-
-    @property
-    def textual_dir(self) -> Path:
-        return self.cache_dir / "textual"
-
-    @property
-    def visual_dir(self) -> Path:
-        return self.cache_dir / "visual"
-
-    @property
-    def model_cfg_path(self) -> Path:
-        return self.cache_dir / "config.json"
-
-    @property
-    def textual_path(self) -> Path:
-        return self.textual_dir / f"model.{self.preferred_runtime}"
-
-    @property
-    def visual_path(self) -> Path:
-        return self.visual_dir / f"model.{self.preferred_runtime}"
-
-    @property
-    def tokenizer_file_path(self) -> Path:
-        return self.textual_dir / "tokenizer.json"
-
-    @property
-    def tokenizer_cfg_path(self) -> Path:
-        return self.textual_dir / "tokenizer_config.json"
-
-    @property
-    def preprocess_cfg_path(self) -> Path:
-        return self.visual_dir / "preprocess_cfg.json"
-
-    @property
-    def cached(self) -> bool:
-        return self.textual_path.is_file() and self.visual_path.is_file()
-
-    @cached_property
-    def model_cfg(self) -> dict[str, Any]:
-        log.debug(f"Loading model config for CLIP model '{self.model_name}'")
-        model_cfg: dict[str, Any] = json.load(self.model_cfg_path.open())
-        log.debug(f"Loaded model config for CLIP model '{self.model_name}'")
-        return model_cfg
-
-    @cached_property
-    def tokenizer_file(self) -> dict[str, Any]:
-        log.debug(f"Loading tokenizer file for CLIP model '{self.model_name}'")
-        tokenizer_file: dict[str, Any] = json.load(self.tokenizer_file_path.open())
-        log.debug(f"Loaded tokenizer file for CLIP model '{self.model_name}'")
-        return tokenizer_file
-
-    @cached_property
-    def tokenizer_cfg(self) -> dict[str, Any]:
-        log.debug(f"Loading tokenizer config for CLIP model '{self.model_name}'")
-        tokenizer_cfg: dict[str, Any] = json.load(self.tokenizer_cfg_path.open())
-        log.debug(f"Loaded tokenizer config for CLIP model '{self.model_name}'")
-        return tokenizer_cfg
-
-    @cached_property
-    def preprocess_cfg(self) -> dict[str, Any]:
-        log.debug(f"Loading visual preprocessing config for CLIP model '{self.model_name}'")
-        preprocess_cfg: dict[str, Any] = json.load(self.preprocess_cfg_path.open())
-        log.debug(f"Loaded visual preprocessing config for CLIP model '{self.model_name}'")
-        return preprocess_cfg
-
-
-class OpenCLIPEncoder(BaseCLIPEncoder):
-    def __init__(
-        self,
-        model_name: str,
-        cache_dir: Path | str | None = None,
-        mode: Literal["text", "vision"] | None = None,
-        **model_kwargs: Any,
-    ) -> None:
-        super().__init__(clean_name(model_name), cache_dir, mode, **model_kwargs)
-
-    def _load(self) -> None:
-        super()._load()
-        self._load_tokenizer()
-
-        size: list[int] | int = self.preprocess_cfg["size"]
-        self.size = size[0] if isinstance(size, list) else size
-
-        self.resampling = get_pil_resampling(self.preprocess_cfg["interpolation"])
-        self.mean = np.array(self.preprocess_cfg["mean"], dtype=np.float32)
-        self.std = np.array(self.preprocess_cfg["std"], dtype=np.float32)
-
-    def _load_tokenizer(self) -> Tokenizer:
-        log.debug(f"Loading tokenizer for CLIP model '{self.model_name}'")
-
-        text_cfg: dict[str, Any] = self.model_cfg["text_cfg"]
-        context_length: int = text_cfg.get("context_length", 77)
-        pad_token: str = self.tokenizer_cfg["pad_token"]
-
-        self.tokenizer: Tokenizer = Tokenizer.from_file(self.tokenizer_file_path.as_posix())
-
-        pad_id: int = self.tokenizer.token_to_id(pad_token)
-        self.tokenizer.enable_padding(length=context_length, pad_token=pad_token, pad_id=pad_id)
-        self.tokenizer.enable_truncation(max_length=context_length)
-
-        log.debug(f"Loaded tokenizer for CLIP model '{self.model_name}'")
-
-    def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
-        tokens: Encoding = self.tokenizer.encode(text)
-        return {"text": np.array([tokens.ids], dtype=np.int32)}
-
-    def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]:
-        image = resize(image, self.size)
-        image = crop(image, self.size)
-        image_np = to_numpy(image)
-        image_np = normalize(image_np, self.mean, self.std)
-        return {"image": np.expand_dims(image_np.transpose(2, 0, 1), 0)}
-
-
-class MCLIPEncoder(OpenCLIPEncoder):
-    def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
-        tokens: Encoding = self.tokenizer.encode(text)
-        return {
-            "input_ids": np.array([tokens.ids], dtype=np.int32),
-            "attention_mask": np.array([tokens.attention_mask], dtype=np.int32),
-        }
diff --git a/machine-learning/app/models/clip/textual.py b/machine-learning/app/models/clip/textual.py
new file mode 100644
index 0000000000..4d86bf05b0
--- /dev/null
+++ b/machine-learning/app/models/clip/textual.py
@@ -0,0 +1,98 @@
+import json
+from abc import abstractmethod
+from functools import cached_property
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from numpy.typing import NDArray
+from tokenizers import Encoding, Tokenizer
+
+from app.config import log
+from app.models.base import InferenceModel
+from app.schemas import ModelSession, ModelTask, ModelType
+
+
+class BaseCLIPTextualEncoder(InferenceModel):
+    depends = []
+    identity = (ModelType.TEXTUAL, ModelTask.SEARCH)
+
+    def _predict(self, inputs: str, **kwargs: Any) -> NDArray[np.float32]:
+        res: NDArray[np.float32] = self.session.run(None, self.tokenize(inputs))[0][0]
+        return res
+
+    def _load(self) -> ModelSession:
+        log.debug(f"Loading tokenizer for CLIP model '{self.model_name}'")
+        self.tokenizer = self._load_tokenizer()
+        log.debug(f"Loaded tokenizer for CLIP model '{self.model_name}'")
+
+        return super()._load()
+
+    @abstractmethod
+    def _load_tokenizer(self) -> Tokenizer:
+        pass
+
+    @abstractmethod
+    def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
+        pass
+
+    @property
+    def model_cfg_path(self) -> Path:
+        return self.cache_dir / "config.json"
+
+    @property
+    def tokenizer_file_path(self) -> Path:
+        return self.model_dir / "tokenizer.json"
+
+    @property
+    def tokenizer_cfg_path(self) -> Path:
+        return self.model_dir / "tokenizer_config.json"
+
+    @cached_property
+    def model_cfg(self) -> dict[str, Any]:
+        log.debug(f"Loading model config for CLIP model '{self.model_name}'")
+        model_cfg: dict[str, Any] = json.load(self.model_cfg_path.open())
+        log.debug(f"Loaded model config for CLIP model '{self.model_name}'")
+        return model_cfg
+
+    @cached_property
+    def tokenizer_file(self) -> dict[str, Any]:
+        log.debug(f"Loading tokenizer file for CLIP model '{self.model_name}'")
+        tokenizer_file: dict[str, Any] = json.load(self.tokenizer_file_path.open())
+        log.debug(f"Loaded tokenizer file for CLIP model '{self.model_name}'")
+        return tokenizer_file
+
+    @cached_property
+    def tokenizer_cfg(self) -> dict[str, Any]:
+        log.debug(f"Loading tokenizer config for CLIP model '{self.model_name}'")
+        tokenizer_cfg: dict[str, Any] = json.load(self.tokenizer_cfg_path.open())
+        log.debug(f"Loaded tokenizer config for CLIP model '{self.model_name}'")
+        return tokenizer_cfg
+
+
+class OpenClipTextualEncoder(BaseCLIPTextualEncoder):
+    def _load_tokenizer(self) -> Tokenizer:
+        text_cfg: dict[str, Any] = self.model_cfg["text_cfg"]
+        context_length: int = text_cfg.get("context_length", 77)
+        pad_token: str = self.tokenizer_cfg["pad_token"]
+
+        tokenizer: Tokenizer = Tokenizer.from_file(self.tokenizer_file_path.as_posix())
+
+        pad_id: int = tokenizer.token_to_id(pad_token)
+        tokenizer.enable_padding(length=context_length, pad_token=pad_token, pad_id=pad_id)
+        tokenizer.enable_truncation(max_length=context_length)
+
+        return tokenizer
+
+    def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
+        tokens: Encoding = self.tokenizer.encode(text)
+        return {"text": np.array([tokens.ids], dtype=np.int32)}
+
+
+class MClipTextualEncoder(OpenClipTextualEncoder):
+    def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
+        tokens: Encoding = self.tokenizer.encode(text)
+        return {
+            "input_ids": np.array([tokens.ids], dtype=np.int32),
+            "attention_mask": np.array([tokens.attention_mask], dtype=np.int32),
+        }
diff --git a/machine-learning/app/models/clip/visual.py b/machine-learning/app/models/clip/visual.py
new file mode 100644
index 0000000000..48058c961a
--- /dev/null
+++ b/machine-learning/app/models/clip/visual.py
@@ -0,0 +1,69 @@
+import json
+from abc import abstractmethod
+from functools import cached_property
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from numpy.typing import NDArray
+from PIL import Image
+
+from app.config import log
+from app.models.base import InferenceModel
+from app.models.transforms import crop_pil, decode_pil, get_pil_resampling, normalize, resize_pil, to_numpy
+from app.schemas import ModelSession, ModelTask, ModelType
+
+
+class BaseCLIPVisualEncoder(InferenceModel):
+    depends = []
+    identity = (ModelType.VISUAL, ModelTask.SEARCH)
+
+    def _predict(self, inputs: Image.Image | bytes, **kwargs: Any) -> NDArray[np.float32]:
+        image = decode_pil(inputs)
+        res: NDArray[np.float32] = self.session.run(None, self.transform(image))[0][0]
+        return res
+
+    @abstractmethod
+    def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]:
+        pass
+
+    @property
+    def model_cfg_path(self) -> Path:
+        return self.cache_dir / "config.json"
+
+    @property
+    def preprocess_cfg_path(self) -> Path:
+        return self.model_dir / "preprocess_cfg.json"
+
+    @cached_property
+    def model_cfg(self) -> dict[str, Any]:
+        log.debug(f"Loading model config for CLIP model '{self.model_name}'")
+        model_cfg: dict[str, Any] = json.load(self.model_cfg_path.open())
+        log.debug(f"Loaded model config for CLIP model '{self.model_name}'")
+        return model_cfg
+
+    @cached_property
+    def preprocess_cfg(self) -> dict[str, Any]:
+        log.debug(f"Loading visual preprocessing config for CLIP model '{self.model_name}'")
+        preprocess_cfg: dict[str, Any] = json.load(self.preprocess_cfg_path.open())
+        log.debug(f"Loaded visual preprocessing config for CLIP model '{self.model_name}'")
+        return preprocess_cfg
+
+
+class OpenClipVisualEncoder(BaseCLIPVisualEncoder):
+    def _load(self) -> ModelSession:
+        size: list[int] | int = self.preprocess_cfg["size"]
+        self.size = size[0] if isinstance(size, list) else size
+
+        self.resampling = get_pil_resampling(self.preprocess_cfg["interpolation"])
+        self.mean = np.array(self.preprocess_cfg["mean"], dtype=np.float32)
+        self.std = np.array(self.preprocess_cfg["std"], dtype=np.float32)
+
+        return super()._load()
+
+    def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]:
+        image = resize_pil(image, self.size)
+        image = crop_pil(image, self.size)
+        image_np = to_numpy(image)
+        image_np = normalize(image_np, self.mean, self.std)
+        return {"image": np.expand_dims(image_np.transpose(2, 0, 1), 0)}
diff --git a/machine-learning/app/models/constants.py b/machine-learning/app/models/constants.py
index b112e9279d..c51dd3b66d 100644
--- a/machine-learning/app/models/constants.py
+++ b/machine-learning/app/models/constants.py
@@ -1,4 +1,5 @@
 from app.config import clean_name
+from app.schemas import ModelSource
 
 _OPENCLIP_MODELS = {
     "RN50__openai",
@@ -54,13 +55,16 @@ _INSIGHTFACE_MODELS = {
 SUPPORTED_PROVIDERS = ["CUDAExecutionProvider", "OpenVINOExecutionProvider", "CPUExecutionProvider"]
 
 
-def is_openclip(model_name: str) -> bool:
-    return clean_name(model_name) in _OPENCLIP_MODELS
+def get_model_source(model_name: str) -> ModelSource | None:
+    cleaned_name = clean_name(model_name)
 
+    if cleaned_name in _INSIGHTFACE_MODELS:
+        return ModelSource.INSIGHTFACE
 
-def is_mclip(model_name: str) -> bool:
-    return clean_name(model_name) in _MCLIP_MODELS
+    if cleaned_name in _MCLIP_MODELS:
+        return ModelSource.MCLIP
 
+    if cleaned_name in _OPENCLIP_MODELS:
+        return ModelSource.OPENCLIP
 
-def is_insightface(model_name: str) -> bool:
-    return clean_name(model_name) in _INSIGHTFACE_MODELS
+    return None
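Note: get_model_source() replaces the three is_* predicates. A sketch of the expected behavior, assuming the example names are present in the corresponding sets above:

    from app.models.constants import get_model_source
    from app.schemas import ModelSource

    assert get_model_source("immich-app/ViT-B-32__openai") == ModelSource.OPENCLIP  # clean_name strips the repo prefix
    assert get_model_source("some-unknown-model") is None
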
diff --git a/machine-learning/app/models/facial_recognition.py b/machine-learning/app/models/facial_recognition.py
deleted file mode 100644
index 894f5ec726..0000000000
--- a/machine-learning/app/models/facial_recognition.py
+++ /dev/null
@@ -1,90 +0,0 @@
-from pathlib import Path
-from typing import Any
-
-import cv2
-import numpy as np
-from insightface.model_zoo import ArcFaceONNX, RetinaFace
-from insightface.utils.face_align import norm_crop
-from numpy.typing import NDArray
-
-from app.config import clean_name
-from app.schemas import Face, ModelType, is_ndarray
-
-from .base import InferenceModel
-
-
-class FaceRecognizer(InferenceModel):
-    _model_type = ModelType.FACIAL_RECOGNITION
-
-    def __init__(
-        self,
-        model_name: str,
-        min_score: float = 0.7,
-        cache_dir: Path | str | None = None,
-        **model_kwargs: Any,
-    ) -> None:
-        self.min_score = model_kwargs.pop("minScore", min_score)
-        super().__init__(clean_name(model_name), cache_dir, **model_kwargs)
-
-    def _load(self) -> None:
-        self.det_model = RetinaFace(session=self._make_session(self.det_file))
-        self.rec_model = ArcFaceONNX(
-            self.rec_file.with_suffix(".onnx").as_posix(),
-            session=self._make_session(self.rec_file),
-        )
-
-        self.det_model.prepare(
-            ctx_id=0,
-            det_thresh=self.min_score,
-            input_size=(640, 640),
-        )
-        self.rec_model.prepare(ctx_id=0)
-
-    def _predict(self, image: NDArray[np.uint8] | bytes) -> list[Face]:
-        if isinstance(image, bytes):
-            decoded_image = cv2.imdecode(np.frombuffer(image, np.uint8), cv2.IMREAD_COLOR)
-        else:
-            decoded_image = image
-        assert is_ndarray(decoded_image, np.uint8)
-        bboxes, kpss = self.det_model.detect(decoded_image)
-        if bboxes.size == 0:
-            return []
-        assert is_ndarray(kpss, np.float32)
-
-        scores = bboxes[:, 4].tolist()
-        bboxes = bboxes[:, :4].round().tolist()
-
-        results = []
-        height, width, _ = decoded_image.shape
-        for (x1, y1, x2, y2), score, kps in zip(bboxes, scores, kpss):
-            cropped_img = norm_crop(decoded_image, kps)
-            embedding: NDArray[np.float32] = self.rec_model.get_feat(cropped_img)[0]
-            face: Face = {
-                "imageWidth": width,
-                "imageHeight": height,
-                "boundingBox": {
-                    "x1": x1,
-                    "y1": y1,
-                    "x2": x2,
-                    "y2": y2,
-                },
-                "score": score,
-                "embedding": embedding,
-            }
-            results.append(face)
-        return results
-
-    @property
-    def cached(self) -> bool:
-        return self.det_file.is_file() and self.rec_file.is_file()
-
-    @property
-    def det_file(self) -> Path:
-        return self.cache_dir / "detection" / f"model.{self.preferred_runtime}"
-
-    @property
-    def rec_file(self) -> Path:
-        return self.cache_dir / "recognition" / f"model.{self.preferred_runtime}"
-
-    def configure(self, **model_kwargs: Any) -> None:
-        self.det_model.det_thresh = model_kwargs.pop("minScore", self.det_model.det_thresh)
diff --git a/machine-learning/app/models/facial_recognition/detection.py b/machine-learning/app/models/facial_recognition/detection.py
new file mode 100644
index 0000000000..2efacd447e
--- /dev/null
+++ b/machine-learning/app/models/facial_recognition/detection.py
@@ -0,0 +1,48 @@
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from insightface.model_zoo import RetinaFace
+from numpy.typing import NDArray
+
+from app.models.base import InferenceModel
+from app.models.transforms import decode_cv2
+from app.schemas import FaceDetectionOutput, ModelSession, ModelTask, ModelType
+
+
+class FaceDetector(InferenceModel):
+    depends = []
+    identity = (ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION)
+
+    def __init__(
+        self,
+        model_name: str,
+        min_score: float = 0.7,
+        cache_dir: Path | str | None = None,
+        **model_kwargs: Any,
+    ) -> None:
+        self.min_score = model_kwargs.pop("minScore", min_score)
+        super().__init__(model_name, cache_dir, **model_kwargs)
+
+    def _load(self) -> ModelSession:
+        session = self._make_session(self.model_path)
+        self.model = RetinaFace(session=session)
+        self.model.prepare(ctx_id=0, det_thresh=self.min_score, input_size=(640, 640))
+
+        return session
+
+    def _predict(self, inputs: NDArray[np.uint8] | bytes, **kwargs: Any) -> FaceDetectionOutput:
+        inputs = decode_cv2(inputs)
+
+        bboxes, landmarks = self._detect(inputs)
+        return {
+            "boxes": bboxes[:, :4].round(),
+            "scores": bboxes[:, 4],
+            "landmarks": landmarks,
+        }
+
+    def _detect(self, inputs: NDArray[np.uint8] | bytes) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
+        return self.model.detect(inputs)  # type: ignore
+
+    def configure(self, **kwargs: Any) -> None:
+        self.model.det_thresh = kwargs.pop("minScore", self.model.det_thresh)
diff --git a/machine-learning/app/models/facial_recognition/recognition.py b/machine-learning/app/models/facial_recognition/recognition.py
new file mode 100644
index 0000000000..cb8093dd95
--- /dev/null
+++ b/machine-learning/app/models/facial_recognition/recognition.py
@@ -0,0 +1,77 @@
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import onnx
+import onnxruntime as ort
+from insightface.model_zoo import ArcFaceONNX
+from insightface.utils.face_align import norm_crop
+from numpy.typing import NDArray
+from onnx.tools.update_model_dims import update_inputs_outputs_dims
+from PIL import Image
+
+from app.config import clean_name, log
+from app.models.base import InferenceModel
+from app.models.transforms import decode_cv2
+from app.schemas import FaceDetectionOutput, FacialRecognitionOutput, ModelSession, ModelTask, ModelType
+
+
+class FaceRecognizer(InferenceModel):
+    depends = [(ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION)]
+    identity = (ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION)
+
+    def __init__(
+        self,
+        model_name: str,
+        min_score: float = 0.7,
+        cache_dir: Path | str | None = None,
+        **model_kwargs: Any,
+    ) -> None:
+        self.min_score = model_kwargs.pop("minScore", min_score)
+        super().__init__(clean_name(model_name), cache_dir, **model_kwargs)
+
+    def _load(self) -> ModelSession:
+        session = self._make_session(self.model_path)
+        if not self._has_batch_dim(session):
+            self._add_batch_dim(self.model_path)
+            session = self._make_session(self.model_path)
+        self.model = ArcFaceONNX(
+            self.model_path.with_suffix(".onnx").as_posix(),
+            session=session,
+        )
+        return session
+
+    def _predict(
+        self, inputs: NDArray[np.uint8] | bytes | Image.Image, faces: FaceDetectionOutput, **kwargs: Any
+    ) -> FacialRecognitionOutput:
+        if faces["boxes"].shape[0] == 0:
+            return []
+        inputs = decode_cv2(inputs)
+        embeddings: NDArray[np.float32] = self.model.get_feat(self._crop(inputs, faces))
+        return self.postprocess(faces, embeddings)
+
+    def postprocess(self, faces: FaceDetectionOutput, embeddings: NDArray[np.float32]) -> FacialRecognitionOutput:
+        return [
+            {
+                "boundingBox": {"x1": x1, "y1": y1, "x2": x2, "y2": y2},
+                "embedding": embedding,
+                "score": score,
+            }
+            for (x1, y1, x2, y2), embedding, score in zip(faces["boxes"], embeddings, faces["scores"])
+        ]
+
+    def _crop(self, image: NDArray[np.uint8], faces: FaceDetectionOutput) -> list[NDArray[np.uint8]]:
+        return [norm_crop(image, landmark) for landmark in faces["landmarks"]]
+
+    def _has_batch_dim(self, session: ort.InferenceSession) -> bool:
+        return not isinstance(session, ort.InferenceSession) or session.get_inputs()[0].shape[0] == "batch"
+
+    def _add_batch_dim(self, model_path: Path) -> None:
+        log.debug(f"Adding batch dimension to model {model_path}")
+        proto = onnx.load(model_path)
+        static_input_dims = [shape.dim_value for shape in proto.graph.input[0].type.tensor_type.shape.dim[1:]]
+        static_output_dims = [shape.dim_value for shape in proto.graph.output[0].type.tensor_type.shape.dim[1:]]
+        input_dims = {proto.graph.input[0].name: ["batch"] + static_input_dims}
+        output_dims = {proto.graph.output[0].name: ["batch"] + static_output_dims}
+        updated_proto = update_inputs_outputs_dims(proto, input_dims, output_dims)
+        onnx.save(updated_proto, model_path)
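Note: a sketch of the detector-to-recognizer hand-off that run_inference() performs based on the depends declaration above. Purely illustrative: it assumes the models are already downloaded to the cache and that "buffalo_l" is a supported model name.

    from pathlib import Path

    from app.models.facial_recognition.detection import FaceDetector
    from app.models.facial_recognition.recognition import FaceRecognizer

    image_bytes = Path("photo.jpg").read_bytes()  # hypothetical input image

    detector = FaceDetector("buffalo_l", min_score=0.7)
    recognizer = FaceRecognizer("buffalo_l")

    faces = detector.predict(image_bytes)             # FaceDetectionOutput: boxes, scores, landmarks
    results = recognizer.predict(image_bytes, faces)  # FacialRecognitionOutput: list of DetectedFace
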
diff --git a/machine-learning/app/models/session.py b/machine-learning/app/models/session.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/machine-learning/app/models/transforms.py b/machine-learning/app/models/transforms.py
index 7ed001b8c2..ababdac99f 100644
--- a/machine-learning/app/models/transforms.py
+++ b/machine-learning/app/models/transforms.py
@@ -1,3 +1,7 @@
+from io import BytesIO
+from typing import IO
+
+import cv2
 import numpy as np
 from numpy.typing import NDArray
 from PIL import Image
@@ -5,7 +9,7 @@ from PIL import Image
 _PIL_RESAMPLING_METHODS = {resampling.name.lower(): resampling for resampling in Image.Resampling}
 
 
-def resize(img: Image.Image, size: int) -> Image.Image:
+def resize_pil(img: Image.Image, size: int) -> Image.Image:
     if img.width < img.height:
         return img.resize((size, int((img.height / img.width) * size)), resample=Image.Resampling.BICUBIC)
     else:
@@ -13,7 +17,7 @@ def resize(img: Image.Image, size: int) -> Image.Image:
 
 
 # https://stackoverflow.com/a/60883103
-def crop(img: Image.Image, size: int) -> Image.Image:
+def crop_pil(img: Image.Image, size: int) -> Image.Image:
     left = int((img.size[0] / 2) - (size / 2))
     upper = int((img.size[1] / 2) - (size / 2))
     right = left + size
@@ -23,14 +27,36 @@ def crop(img: Image.Image, size: int) -> Image.Image:
 
 
 def to_numpy(img: Image.Image) -> NDArray[np.float32]:
-    return np.asarray(img.convert("RGB")).astype(np.float32) / 255.0
+    return np.asarray(img if img.mode == "RGB" else img.convert("RGB"), dtype=np.float32) / 255.0
 
 
 def normalize(
     img: NDArray[np.float32], mean: float | NDArray[np.float32], std: float | NDArray[np.float32]
 ) -> NDArray[np.float32]:
-    return (img - mean) / std
+    return np.divide(img - mean, std, dtype=np.float32)
 
 
 def get_pil_resampling(resample: str) -> Image.Resampling:
     return _PIL_RESAMPLING_METHODS[resample.lower()]
+
+
+def pil_to_cv2(image: Image.Image) -> NDArray[np.uint8]:
+    return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)  # type: ignore
+
+
+def decode_pil(image_bytes: bytes | IO[bytes] | Image.Image) -> Image.Image:
+    if isinstance(image_bytes, Image.Image):
+        return image_bytes
+    image = Image.open(BytesIO(image_bytes) if isinstance(image_bytes, bytes) else image_bytes)
+    image.load()  # type: ignore
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    return image
+
+
+def decode_cv2(image_bytes: NDArray[np.uint8] | bytes | Image.Image) -> NDArray[np.uint8]:
+    if isinstance(image_bytes, bytes):
+        image_bytes = decode_pil(image_bytes)  # pillow is much faster than cv2
+    if isinstance(image_bytes, Image.Image):
+        return pil_to_cv2(image_bytes)
+    return image_bytes

diff --git a/machine-learning/app/schemas.py b/machine-learning/app/schemas.py
index e2a027e1b8..739098a6bf 100644
--- a/machine-learning/app/schemas.py
+++ b/machine-learning/app/schemas.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Any, Protocol, TypedDict, TypeGuard
+from typing import Any, Literal, Protocol, TypedDict, TypeGuard, TypeVar
 
 import numpy as np
 import numpy.typing as npt
@@ -28,31 +28,87 @@ class BoundingBox(TypedDict):
     y2: int
 
 
-class ModelType(StrEnum):
-    CLIP = "clip"
+class ModelTask(StrEnum):
     FACIAL_RECOGNITION = "facial-recognition"
+    SEARCH = "clip"
 
 
-class ModelRuntime(StrEnum):
-    ONNX = "onnx"
+class ModelType(StrEnum):
+    DETECTION = "detection"
+    RECOGNITION = "recognition"
+    TEXTUAL = "textual"
+    VISUAL = "visual"
+
+
+class ModelFormat(StrEnum):
     ARMNN = "armnn"
+    ONNX = "onnx"
+
+
+class ModelSource(StrEnum):
+    INSIGHTFACE = "insightface"
+    MCLIP = "mclip"
+    OPENCLIP = "openclip"
+
+
+ModelIdentity = tuple[ModelType, ModelTask]
+
+
+class ModelSession(Protocol):
+    def run(
+        self,
+        output_names: list[str] | None,
+        input_feed: dict[str, npt.NDArray[np.float32]] | dict[str, npt.NDArray[np.int32]],
+        run_options: Any = None,
+    ) -> list[npt.NDArray[np.float32]]: ...
 
 
 class HasProfiling(Protocol):
     profiling: dict[str, float]
 
 
-class Face(TypedDict):
+class FaceDetectionOutput(TypedDict):
+    boxes: npt.NDArray[np.float32]
+    scores: npt.NDArray[np.float32]
+    landmarks: npt.NDArray[np.float32]
+
+
+class DetectedFace(TypedDict):
     boundingBox: BoundingBox
     embedding: npt.NDArray[np.float32]
-    imageWidth: int
-    imageHeight: int
     score: float
 
 
+FacialRecognitionOutput = list[DetectedFace]
+
+
+class PipelineEntry(TypedDict):
+    modelName: str
+    options: dict[str, Any]
+
+
+PipelineRequest = dict[ModelTask, dict[ModelType, PipelineEntry]]
+
+
+class InferenceEntry(TypedDict):
+    name: str
+    task: ModelTask
+    type: ModelType
+    options: dict[str, Any]
+
+
+InferenceEntries = tuple[list[InferenceEntry], list[InferenceEntry]]
+
+
+InferenceResponse = dict[ModelTask | Literal["imageHeight"] | Literal["imageWidth"], Any]
+
+
 def has_profiling(obj: Any) -> TypeGuard[HasProfiling]:
     return hasattr(obj, "profiling") and isinstance(obj.profiling, dict)
 
 
 def is_ndarray(obj: Any, dtype: "type[np._DTypeScalar_co]") -> "TypeGuard[npt.NDArray[np._DTypeScalar_co]]":
     return isinstance(obj, np.ndarray) and obj.dtype == dtype
+
+
+T = TypeVar("T")
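Note: for reference, the shape of an InferenceResponse as assembled in run_inference(): one entry per requested ModelTask plus the image dimensions. Values below are illustrative only.

    response = {
        "clip": [0.12, -0.03, 0.41],             # ModelTask.SEARCH -> embedding
        "facial-recognition": [                  # ModelTask.FACIAL_RECOGNITION -> FacialRecognitionOutput
            {
                "boundingBox": {"x1": 10, "y1": 20, "x2": 110, "y2": 140},
                "embedding": [0.05, 0.17, -0.22],
                "score": 0.98,
            },
        ],
        "imageHeight": 1080,
        "imageWidth": 1920,
    }
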
diff --git a/machine-learning/app/test_main.py b/machine-learning/app/test_main.py
index 22038a6544..d9d1455bd1 100644
--- a/machine-learning/app/test_main.py
+++ b/machine-learning/app/test_main.py
@@ -17,13 +17,15 @@ from pytest import MonkeyPatch
 from pytest_mock import MockerFixture
 
 from app.main import load, preload_models
+from app.models.clip.textual import MClipTextualEncoder, OpenClipTextualEncoder
+from app.models.clip.visual import OpenClipVisualEncoder
+from app.models.facial_recognition.detection import FaceDetector
+from app.models.facial_recognition.recognition import FaceRecognizer
 
 from .config import Settings, log, settings
 from .models.base import InferenceModel
 from .models.cache import ModelCache
-from .models.clip import MCLIPEncoder, OpenCLIPEncoder
-from .models.facial_recognition import FaceRecognizer
-from .schemas import ModelRuntime, ModelType
+from .schemas import ModelFormat, ModelTask, ModelType
 
 
 class TestBase:
@@ -35,13 +37,13 @@ class TestBase:
 
     @pytest.mark.providers(CPU_EP)
     def test_sets_cpu_provider(self, providers: list[str]) -> None:
-        encoder = OpenCLIPEncoder("ViT-B-32__openai")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai")
 
         assert encoder.providers == self.CPU_EP
 
     @pytest.mark.providers(CUDA_EP)
     def test_sets_cuda_provider_if_available(self, providers: list[str]) -> None:
-        encoder = OpenCLIPEncoder("ViT-B-32__openai")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai")
 
         assert encoder.providers == self.CUDA_EP
 
@@ -50,7 +52,7 @@ class TestBase:
         mocked = mocker.patch("app.models.base.ort.capi._pybind_state")
         mocked.get_available_openvino_device_ids.return_value = ["GPU.0", "CPU"]
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai")
 
         assert encoder.providers == self.OV_EP
 
@@ -59,25 +61,25 @@ class TestBase:
         mocked = mocker.patch("app.models.base.ort.capi._pybind_state")
         mocked.get_available_openvino_device_ids.return_value = ["CPU"]
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai")
 
         assert encoder.providers == self.CPU_EP
 
     @pytest.mark.providers(CUDA_EP_OUT_OF_ORDER)
     def test_sets_providers_in_correct_order(self, providers: list[str]) -> None:
-        encoder = OpenCLIPEncoder("ViT-B-32__openai")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai")
 
         assert encoder.providers == self.CUDA_EP
 
     @pytest.mark.providers(TRT_EP)
     def test_ignores_unsupported_providers(self, providers: list[str]) -> None:
-        encoder = OpenCLIPEncoder("ViT-B-32__openai")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai")
 
         assert encoder.providers == self.CUDA_EP
 
     def test_sets_provider_kwarg(self) -> None:
         providers = ["CUDAExecutionProvider"]
-        encoder = OpenCLIPEncoder("ViT-B-32__openai", providers=providers)
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai", providers=providers)
 
         assert encoder.providers == providers
 
@@ -85,7 +87,9 @@ class TestBase:
         mocked = mocker.patch("app.models.base.ort.capi._pybind_state")
         mocked.get_available_openvino_device_ids.return_value = ["GPU.0", "CPU"]
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai", providers=["OpenVINOExecutionProvider", "CPUExecutionProvider"])
+        encoder = OpenClipTextualEncoder(
+            "ViT-B-32__openai", providers=["OpenVINOExecutionProvider", "CPUExecutionProvider"]
+        )
 
         assert encoder.provider_options == [
             {"device_type": "GPU_FP32", "cache_dir": (encoder.cache_dir / "openvino").as_posix()},
@@ -93,7 +97,7 @@ class TestBase:
         ]
 
     def test_sets_provider_options_kwarg(self) -> None:
-        encoder = OpenCLIPEncoder(
+        encoder = OpenClipTextualEncoder(
             "ViT-B-32__openai",
             providers=["OpenVINOExecutionProvider", "CPUExecutionProvider"],
             provider_options=[],
@@ -102,7 +106,7 @@ class TestBase:
         assert encoder.provider_options == []
 
     def test_sets_default_sess_options(self) -> None:
-        encoder = OpenCLIPEncoder("ViT-B-32__openai")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai")
 
         assert encoder.sess_options.execution_mode == ort.ExecutionMode.ORT_SEQUENTIAL
         assert encoder.sess_options.inter_op_num_threads == 1
@@ -110,7 +114,9 @@ class TestBase:
         assert encoder.sess_options.enable_cpu_mem_arena is False
 
     def test_sets_default_sess_options_does_not_set_threads_if_non_cpu_and_default_threads(self) -> None:
-        encoder = OpenCLIPEncoder("ViT-B-32__openai", providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
+        encoder = OpenClipTextualEncoder(
+            "ViT-B-32__openai", providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
+        )
 
         assert encoder.sess_options.inter_op_num_threads == 0
         assert encoder.sess_options.intra_op_num_threads == 0
@@ -120,14 +126,16 @@ class TestBase:
         mock_settings.model_inter_op_threads = 2
         mock_settings.model_intra_op_threads = 4
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai", providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
+        encoder = OpenClipTextualEncoder(
+            "ViT-B-32__openai", providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
+        )
 
         assert encoder.sess_options.inter_op_num_threads == 2
         assert encoder.sess_options.intra_op_num_threads == 4
 
     def test_sets_sess_options_kwarg(self) -> None:
         sess_options = ort.SessionOptions()
-        encoder = OpenCLIPEncoder(
+        encoder = OpenClipTextualEncoder(
             "ViT-B-32__openai",
             providers=["OpenVINOExecutionProvider", "CPUExecutionProvider"],
             provider_options=[],
@@ -137,43 +145,43 @@ class TestBase:
         assert sess_options is encoder.sess_options
 
     def test_sets_default_cache_dir(self) -> None:
-        encoder = OpenCLIPEncoder("ViT-B-32__openai")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai")
 
         assert encoder.cache_dir == Path(settings.cache_folder) / "clip" / "ViT-B-32__openai"
 
     def test_sets_cache_dir_kwarg(self) -> None:
         cache_dir = Path("/test_cache")
-        encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir=cache_dir)
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir=cache_dir)
 
         assert encoder.cache_dir == cache_dir
 
-    def test_sets_default_preferred_runtime(self, mocker: MockerFixture) -> None:
+    def test_sets_default_preferred_format(self, mocker: MockerFixture) -> None:
         mocker.patch.object(settings, "ann", True)
         mocker.patch("ann.ann.is_available", False)
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai")
 
-        assert encoder.preferred_runtime == ModelRuntime.ONNX
+        assert encoder.preferred_format == ModelFormat.ONNX
 
-    def test_sets_default_preferred_runtime_to_armnn_if_available(self, mocker: MockerFixture) -> None:
+    def test_sets_default_preferred_format_to_armnn_if_available(self, mocker: MockerFixture) -> None:
         mocker.patch.object(settings, "ann", True)
         mocker.patch("ann.ann.is_available", True)
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai")
 
-        assert encoder.preferred_runtime == ModelRuntime.ARMNN
+        assert encoder.preferred_format == ModelFormat.ARMNN
 
-    def test_sets_preferred_runtime_kwarg(self, mocker: MockerFixture) -> None:
+    def test_sets_preferred_format_kwarg(self, mocker: MockerFixture) -> None:
         mocker.patch.object(settings, "ann", False)
         mocker.patch("ann.ann.is_available", False)
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai", preferred_runtime=ModelRuntime.ARMNN)
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai", preferred_format=ModelFormat.ARMNN)
 
-        assert encoder.preferred_runtime == ModelRuntime.ARMNN
+        assert encoder.preferred_format == ModelFormat.ARMNN
 
     def test_casts_cache_dir_string_to_path(self) -> None:
         cache_dir = "/test_cache"
-        encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir=cache_dir)
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir=cache_dir)
 
         assert encoder.cache_dir == Path(cache_dir)
 
@@ -186,7 +194,7 @@ class TestBase:
         mocker.patch("app.models.base.Path", return_value=mock_cache_dir)
         info = mocker.spy(log, "info")
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir)
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir)
         encoder.clear_cache()
 
         mock_rmtree.assert_called_once_with(encoder.cache_dir)
@@ -201,7 +209,7 @@ class TestBase:
         mocker.patch("app.models.base.Path", return_value=mock_cache_dir)
         warning = mocker.spy(log, "warning")
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir)
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir)
         encoder.clear_cache()
 
         mock_rmtree.assert_not_called()
@@ -215,7 +223,7 @@ class TestBase:
         mock_cache_dir.is_dir.return_value = True
         mocker.patch("app.models.base.Path", return_value=mock_cache_dir)
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir)
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir)
         with pytest.raises(RuntimeError):
             encoder.clear_cache()
 
@@ -230,7 +238,7 @@ class TestBase:
         mocker.patch("app.models.base.Path", return_value=mock_cache_dir)
         warning = mocker.spy(log, "warning")
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir)
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir)
         encoder.clear_cache()
 
         mock_rmtree.assert_not_called()
@@ -245,7 +253,7 @@ class TestBase:
         mock_model_path.with_suffix.return_value = mock_model_path
         mock_ann = mocker.patch("app.models.base.AnnSession")
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai")
         encoder._make_session(mock_model_path)
 
         mock_ann.assert_called_once()
@@ -263,7 +271,7 @@ class TestBase:
         mock_ann = mocker.patch("app.models.base.AnnSession")
         mock_ort = mocker.patch("app.models.base.ort.InferenceSession")
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai")
         encoder._make_session(mock_armnn_path)
 
         mock_ort.assert_called_once()
@@ -277,7 +285,7 @@ class TestBase:
         mock_ann = mocker.patch("app.models.base.AnnSession")
         mock_ort = mocker.patch("app.models.base.ort.InferenceSession")
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai")
         with pytest.raises(ValueError):
             encoder._make_session(mock_model_path)
 
@@ -287,7 +295,7 @@ class TestBase:
     def test_download(self, mocker: MockerFixture) -> None:
         mock_snapshot_download = mocker.patch("app.models.base.snapshot_download")
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir="/path/to/cache")
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir="/path/to/cache")
         encoder.download()
 
         mock_snapshot_download.assert_called_once_with(
@@ -298,10 +306,10 @@ class TestBase:
             ignore_patterns=["*.armnn"],
         )
 
-    def test_download_downloads_armnn_if_preferred_runtime(self, mocker: MockerFixture) -> None:
+    def test_download_downloads_armnn_if_preferred_format(self, mocker: MockerFixture) -> None:
         mock_snapshot_download = mocker.patch("app.models.base.snapshot_download")
 
-        encoder = OpenCLIPEncoder("ViT-B-32__openai", preferred_runtime=ModelRuntime.ARMNN)
+        encoder = OpenClipTextualEncoder("ViT-B-32__openai", preferred_format=ModelFormat.ARMNN)
         encoder.download()
 
         mock_snapshot_download.assert_called_once_with(
@@ -323,21 +331,17 @@ class TestCLIP:
         mocker: MockerFixture,
         clip_model_cfg: dict[str, Any],
         clip_preprocess_cfg: Callable[[Path], dict[str, Any]],
-        clip_tokenizer_cfg: Callable[[Path], dict[str, Any]],
     ) -> None:
-        mocker.patch.object(OpenCLIPEncoder, "download")
-        mocker.patch.object(OpenCLIPEncoder, "model_cfg", clip_model_cfg)
-        mocker.patch.object(OpenCLIPEncoder, "preprocess_cfg", clip_preprocess_cfg)
-        mocker.patch.object(OpenCLIPEncoder, "tokenizer_cfg", clip_tokenizer_cfg)
+        mocker.patch.object(OpenClipVisualEncoder, "download")
+        mocker.patch.object(OpenClipVisualEncoder, "model_cfg", clip_model_cfg)
+        mocker.patch.object(OpenClipVisualEncoder, "preprocess_cfg", clip_preprocess_cfg)
 
         mocked = mocker.patch.object(InferenceModel, "_make_session", autospec=True).return_value
         mocked.run.return_value = [[self.embedding]]
-        mocker.patch("app.models.clip.Tokenizer.from_file", autospec=True)
 
-        clip_encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir="test_cache", mode="vision")
+        clip_encoder = OpenClipVisualEncoder("ViT-B-32__openai", cache_dir="test_cache")
         embedding = clip_encoder.predict(pil_image)
 
-        assert clip_encoder.mode == "vision"
         assert isinstance(embedding, np.ndarray)
         assert embedding.shape[0] == clip_model_cfg["embed_dim"]
         assert embedding.dtype == np.float32
@@ -347,22 +351,19 @@ class TestCLIP:
         self,
         mocker: MockerFixture,
         clip_model_cfg: dict[str, Any],
-        clip_preprocess_cfg: Callable[[Path], dict[str, Any]],
         clip_tokenizer_cfg: Callable[[Path], dict[str, Any]],
     ) -> None:
-        mocker.patch.object(OpenCLIPEncoder, "download")
-        mocker.patch.object(OpenCLIPEncoder, "model_cfg", clip_model_cfg)
-        mocker.patch.object(OpenCLIPEncoder, "preprocess_cfg", clip_preprocess_cfg)
-        mocker.patch.object(OpenCLIPEncoder, "tokenizer_cfg", clip_tokenizer_cfg)
+        mocker.patch.object(OpenClipTextualEncoder, "download")
+        mocker.patch.object(OpenClipTextualEncoder, "model_cfg", clip_model_cfg)
+        mocker.patch.object(OpenClipTextualEncoder, "tokenizer_cfg", clip_tokenizer_cfg)
 
         mocked = mocker.patch.object(InferenceModel, "_make_session", autospec=True).return_value
         mocked.run.return_value = [[self.embedding]]
-        mocker.patch("app.models.clip.Tokenizer.from_file", autospec=True)
+        mocker.patch("app.models.clip.textual.Tokenizer.from_file", autospec=True)
 
-        clip_encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir="test_cache", mode="text")
+        clip_encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir="test_cache")
         embedding = clip_encoder.predict("test search query")
 
-        assert clip_encoder.mode == "text"
         assert isinstance(embedding, np.ndarray)
         assert embedding.shape[0] == clip_model_cfg["embed_dim"]
         assert embedding.dtype == np.float32
@@ -372,19 +373,18 @@ class TestCLIP:
         self,
         mocker: MockerFixture,
         clip_model_cfg: dict[str, Any],
-        clip_preprocess_cfg: Callable[[Path], dict[str, Any]],
         clip_tokenizer_cfg: Callable[[Path], dict[str, Any]],
     ) -> None:
-        mocker.patch.object(OpenCLIPEncoder, "download")
-        mocker.patch.object(OpenCLIPEncoder, "model_cfg", clip_model_cfg)
-        mocker.patch.object(OpenCLIPEncoder, "preprocess_cfg", clip_preprocess_cfg)
-        mocker.patch.object(OpenCLIPEncoder, "tokenizer_cfg", clip_tokenizer_cfg)
-        mock_tokenizer = mocker.patch("app.models.clip.Tokenizer.from_file", autospec=True).return_value
+        mocker.patch.object(OpenClipTextualEncoder, "download")
+        mocker.patch.object(OpenClipTextualEncoder, "model_cfg", clip_model_cfg)
+        mocker.patch.object(OpenClipTextualEncoder, "tokenizer_cfg", clip_tokenizer_cfg)
+        mocker.patch.object(InferenceModel, "_make_session", autospec=True).return_value
+        mock_tokenizer = mocker.patch("app.models.clip.textual.Tokenizer.from_file", autospec=True).return_value
         mock_ids = [randint(0, 50000) for _ in range(77)]
         mock_tokenizer.encode.return_value = SimpleNamespace(ids=mock_ids)
 
-        clip_encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir="test_cache", mode="text")
-        clip_encoder._load_tokenizer()
+        clip_encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir="test_cache")
+        clip_encoder._load()
         tokens = clip_encoder.tokenize("test search query")
 
         assert "text" in tokens
@@ -397,20 +397,19 @@ class TestCLIP:
         self,
         mocker: MockerFixture,
         clip_model_cfg: dict[str, Any],
-        clip_preprocess_cfg: Callable[[Path], dict[str, Any]],
         clip_tokenizer_cfg: Callable[[Path], dict[str, Any]],
     ) -> None:
-        mocker.patch.object(OpenCLIPEncoder, "download")
-        mocker.patch.object(OpenCLIPEncoder, "model_cfg", clip_model_cfg)
-        mocker.patch.object(OpenCLIPEncoder, "preprocess_cfg", clip_preprocess_cfg)
-        mocker.patch.object(OpenCLIPEncoder, "tokenizer_cfg", clip_tokenizer_cfg)
-        mock_tokenizer = mocker.patch("app.models.clip.Tokenizer.from_file", autospec=True).return_value
+        mocker.patch.object(MClipTextualEncoder, "download")
+        mocker.patch.object(MClipTextualEncoder, "model_cfg", clip_model_cfg)
+        mocker.patch.object(MClipTextualEncoder, "tokenizer_cfg", clip_tokenizer_cfg)
+        mocker.patch.object(InferenceModel, "_make_session", autospec=True).return_value
+        mock_tokenizer = mocker.patch("app.models.clip.textual.Tokenizer.from_file", autospec=True).return_value
         mock_ids = [randint(0, 50000) for _ in range(77)]
         mock_attention_mask = [randint(0, 1) for _ in range(77)]
         mock_tokenizer.encode.return_value = SimpleNamespace(ids=mock_ids, attention_mask=mock_attention_mask)
 
-        clip_encoder = MCLIPEncoder("ViT-B-32__openai", cache_dir="test_cache", mode="text")
-        clip_encoder._load_tokenizer()
+        clip_encoder = MClipTextualEncoder("ViT-B-32__openai", cache_dir="test_cache")
+        clip_encoder._load()
         tokens = clip_encoder.tokenize("test search query")
 
         assert "input_ids" in tokens
@@ -430,59 +429,90 @@ class TestFaceRecognition:
 
         assert face_recognizer.min_score == 0.5
 
-    def test_basic(self, cv_image: cv2.Mat, mocker: MockerFixture) -> None:
-        mocker.patch.object(FaceRecognizer, "load")
-        face_recognizer = FaceRecognizer("buffalo_s", min_score=0.0, cache_dir="test_cache")
+    def test_detection(self, cv_image: cv2.Mat, mocker: MockerFixture) -> None:
+        mocker.patch.object(FaceDetector, "load")
+        face_detector = FaceDetector("buffalo_s", min_score=0.0, cache_dir="test_cache")
 
         det_model = mock.Mock()
         num_faces = 2
         bbox = np.random.rand(num_faces, 4).astype(np.float32)
-        score = np.array([[0.67]] * num_faces).astype(np.float32)
+        scores = np.array([[0.67]] * num_faces).astype(np.float32)
         kpss = np.random.rand(num_faces, 5, 2).astype(np.float32)
-        det_model.detect.return_value = (np.concatenate([bbox, score], axis=-1), kpss)
-        face_recognizer.det_model = det_model
+        det_model.detect.return_value = (np.concatenate([bbox, scores], axis=-1), kpss)
+        face_detector.model = det_model
+
+        faces = face_detector.predict(cv_image)
+
+        assert isinstance(faces, dict)
+        assert isinstance(faces.get("boxes", None), np.ndarray)
+        assert isinstance(faces.get("landmarks", None), np.ndarray)
+        assert isinstance(faces.get("scores", None), np.ndarray)
+        assert np.equal(faces["boxes"], bbox.round()).all()
+        assert np.equal(faces["landmarks"], kpss).all()
+        assert np.equal(faces["scores"], scores).all()
+        det_model.detect.assert_called_once()
+
+    def test_recognition(self, cv_image: cv2.Mat, mocker: MockerFixture) -> None:
+        mocker.patch.object(FaceRecognizer, "load")
+        face_recognizer = FaceRecognizer("buffalo_s", min_score=0.0, cache_dir="test_cache")
+
+        num_faces = 2
+        bbox = np.random.rand(num_faces, 4).astype(np.float32)
+        scores = np.array([0.67] * num_faces).astype(np.float32)
+        kpss = np.random.rand(num_faces, 5, 2).astype(np.float32)
+        faces = {"boxes": bbox, "landmarks": kpss, "scores": scores}
 
         rec_model = mock.Mock()
         embedding = np.random.rand(num_faces, 512).astype(np.float32)
         rec_model.get_feat.return_value = embedding
-        face_recognizer.rec_model = rec_model
+        face_recognizer.model = rec_model
 
-        faces = face_recognizer.predict(cv_image)
+        faces = face_recognizer.predict(cv_image, faces)
 
+        assert isinstance(faces, list)
         assert len(faces) == num_faces
         for face in faces:
-            assert face["imageHeight"] == 800
-            assert face["imageWidth"] == 600
-            assert isinstance(face["embedding"], np.ndarray)
+            assert isinstance(face.get("boundingBox"), dict)
+            assert set(face["boundingBox"]) == {"x1", "y1", "x2", "y2"}
+            assert all(isinstance(val, np.float32) for val in face["boundingBox"].values())
+            assert isinstance(face.get("embedding"), np.ndarray)
             assert face["embedding"].shape[0] == 512
-            assert face["embedding"].dtype == np.float32
+            assert isinstance(face.get("score", None), np.float32)
 
-        det_model.detect.assert_called_once()
-        assert rec_model.get_feat.call_count == num_faces
+        rec_model.get_feat.assert_called_once()
+        call_args = rec_model.get_feat.call_args_list[0].args
+        assert len(call_args) == 1
+        assert isinstance(call_args[0], list)
+        assert isinstance(call_args[0][0], np.ndarray)
+        assert call_args[0][0].shape == (112, 112, 3)
 
 
 @pytest.mark.asyncio
 class TestCache:
     async def test_caches(self, mock_get_model: mock.Mock) -> None:
         model_cache = ModelCache()
-        await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION)
-        await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION)
+        await model_cache.get("test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION)
+        await model_cache.get("test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION)
         assert len(model_cache.cache._cache) == 1
         mock_get_model.assert_called_once()
 
     async def test_kwargs_used(self, mock_get_model: mock.Mock) -> None:
         model_cache = ModelCache()
-        await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION, cache_dir="test_cache")
-        mock_get_model.assert_called_once_with(ModelType.FACIAL_RECOGNITION, "test_model_name", cache_dir="test_cache")
+        await model_cache.get(
+            "test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION, cache_dir="test_cache"
+        )
+        mock_get_model.assert_called_once_with(
+            "test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION, cache_dir="test_cache"
+        )
 
     async def test_different_clip(self, mock_get_model: mock.Mock) -> None:
         model_cache = ModelCache()
-        await model_cache.get("test_image_model_name", ModelType.CLIP)
-        await model_cache.get("test_text_model_name", ModelType.CLIP)
+        await model_cache.get("test_model_name", ModelType.VISUAL, ModelTask.SEARCH)
+        await model_cache.get("test_model_name", ModelType.TEXTUAL, ModelTask.SEARCH)
         mock_get_model.assert_has_calls(
             [
-                mock.call(ModelType.CLIP, "test_image_model_name"),
-                mock.call(ModelType.CLIP, "test_text_model_name"),
+                mock.call("test_model_name", ModelType.VISUAL, ModelTask.SEARCH),
+                mock.call("test_model_name", ModelType.TEXTUAL, ModelTask.SEARCH),
             ]
         )
         assert len(model_cache.cache._cache) == 2
@@ -490,19 +520,19 @@ class TestCache:
     @mock.patch("app.models.cache.OptimisticLock", autospec=True)
     async def test_model_ttl(self, mock_lock_cls: mock.Mock, mock_get_model: mock.Mock) -> None:
         model_cache = ModelCache()
-        await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION, ttl=100)
+        await model_cache.get("test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION, ttl=100)
         mock_lock_cls.return_value.__aenter__.return_value.cas.assert_called_with(mock.ANY, ttl=100)
 
     @mock.patch("app.models.cache.SimpleMemoryCache.expire")
     async def test_revalidate_get(self, mock_cache_expire: mock.Mock, mock_get_model: mock.Mock) -> None:
         model_cache = ModelCache(revalidate=True)
-        await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION, ttl=100)
-        await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION, ttl=100)
+        await model_cache.get("test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION, ttl=100)
+        await model_cache.get("test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION, ttl=100)
         mock_cache_expire.assert_called_once_with(mock.ANY, 100)
 
     async def test_profiling(self, mock_get_model: mock.Mock) -> None:
         model_cache = ModelCache(profiling=True)
-        await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION, ttl=100)
+        await model_cache.get("test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION, ttl=100)
         profiling = await model_cache.get_profiling()
         assert isinstance(profiling, dict)
         assert profiling == model_cache.cache.profiling
@@ -510,9 +540,9 @@ class TestCache:
     async def test_loads_mclip(self) -> None:
         model_cache = ModelCache()
 
-        model = await model_cache.get("XLM-Roberta-Large-Vit-B-32", ModelType.CLIP, mode="text")
+        model = await model_cache.get("XLM-Roberta-Large-Vit-B-32", ModelType.TEXTUAL, ModelTask.SEARCH)
 
-        assert isinstance(model, MCLIPEncoder)
+        assert isinstance(model, MClipTextualEncoder)
         assert model.model_name == "XLM-Roberta-Large-Vit-B-32"
 
     async def test_raises_exception_if_invalid_model_type(self) -> None:
@@ -520,15 +550,55 @@ class TestCache:
         model_cache = ModelCache()
 
         with pytest.raises(ValueError):
-            await model_cache.get("XLM-Roberta-Large-Vit-B-32", invalid, mode="text")
+            await model_cache.get("XLM-Roberta-Large-Vit-B-32", ModelType.TEXTUAL, invalid)
 
     async def test_raises_exception_if_unknown_model_name(self) -> None:
         model_cache = ModelCache()
 
         with pytest.raises(ValueError):
-            await model_cache.get("test_model_name", ModelType.CLIP, mode="text")
+            await model_cache.get("test_model_name", ModelType.TEXTUAL, ModelTask.SEARCH)
 
-    async def test_preloads_models(self, monkeypatch: MonkeyPatch, mock_get_model: mock.Mock) -> None:
+    async def test_preloads_clip_models(self, monkeypatch: MonkeyPatch, mock_get_model: mock.Mock) -> None:
+        os.environ["MACHINE_LEARNING_PRELOAD__CLIP"] = "ViT-B-32__openai"
+
+        settings = Settings()
+        assert settings.preload is not None
+        assert settings.preload.clip == "ViT-B-32__openai"
+
+        model_cache = ModelCache()
+        monkeypatch.setattr("app.main.model_cache", model_cache)
+
+        await preload_models(settings.preload)
+        mock_get_model.assert_has_calls(
+            [
+                mock.call("ViT-B-32__openai", ModelType.TEXTUAL, ModelTask.SEARCH),
+                mock.call("ViT-B-32__openai", ModelType.VISUAL, ModelTask.SEARCH),
+            ],
+            any_order=True,
+        )
+
+    async def test_preloads_facial_recognition_models(
+        self, monkeypatch: MonkeyPatch, mock_get_model: mock.Mock
+    ) -> None:
+        os.environ["MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION"] = "buffalo_s"
+
+        settings = Settings()
+        assert settings.preload is not None
+        assert settings.preload.facial_recognition == "buffalo_s"
+
+        model_cache = ModelCache()
+        monkeypatch.setattr("app.main.model_cache", model_cache)
+
+        await preload_models(settings.preload)
+        mock_get_model.assert_has_calls(
+            [
+                mock.call("buffalo_s", ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION),
+                mock.call("buffalo_s", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION),
+            ],
+            any_order=True,
+        )
+
+    async def test_preloads_all_models(self, monkeypatch: MonkeyPatch, mock_get_model: mock.Mock) -> None:
         os.environ["MACHINE_LEARNING_PRELOAD__CLIP"] = "ViT-B-32__openai"
         os.environ["MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION"] = "buffalo_s"
 
@@ -541,11 +611,15 @@ class TestCache:
         monkeypatch.setattr("app.main.model_cache", model_cache)
 
         await preload_models(settings.preload)
-        assert len(model_cache.cache._cache) == 2
-        assert mock_get_model.call_count == 2
-        await model_cache.get("ViT-B-32__openai", ModelType.CLIP, ttl=100)
-        await model_cache.get("buffalo_s", ModelType.FACIAL_RECOGNITION, ttl=100)
-        assert mock_get_model.call_count == 2
+        mock_get_model.assert_has_calls(
+            [
+                mock.call("ViT-B-32__openai", ModelType.TEXTUAL, ModelTask.SEARCH),
+                mock.call("ViT-B-32__openai", ModelType.VISUAL, ModelTask.SEARCH),
+                mock.call("buffalo_s", ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION),
+                mock.call("buffalo_s", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION),
+            ],
+            any_order=True,
+        )
 
 
 @pytest.mark.asyncio
@@ -572,7 +646,8 @@ class TestLoad:
     async def test_load_clears_cache_and_retries_if_os_error(self) -> None:
         mock_model = mock.Mock(spec=InferenceModel)
         mock_model.model_name = "test_model_name"
-        mock_model.model_type = ModelType.CLIP
+        mock_model.model_type = ModelType.VISUAL
+        mock_model.model_task = ModelTask.SEARCH
         mock_model.load.side_effect = [OSError, None]
         mock_model.loaded = False
 
@@ -597,13 +672,15 @@ class TestEndpoints:
 
         response = deployed_app.post(
             "http://localhost:3003/predict",
-            data={"modelName": "ViT-B-32__openai", "modelType": "clip", "options": json.dumps({"mode": "vision"})},
+            data={"entries": json.dumps({"clip": {"visual": {"modelName": "ViT-B-32__openai"}}})},
             files={"image": byte_image.getvalue()},
         )
 
         actual = response.json()
         assert response.status_code == 200
-        assert np.allclose(expected, actual)
+        assert isinstance(actual, dict)
+        assert isinstance(actual.get("clip", None), list)
+        assert np.allclose(expected, actual["clip"])
 
     def test_clip_text_endpoint(self, responses: dict[str, Any], deployed_app: TestClient) -> None:
         expected = responses["clip"]["text"]
@@ -611,38 +688,49 @@ class TestEndpoints:
         response = deployed_app.post(
             "http://localhost:3003/predict",
             data={
-                "modelName": "ViT-B-32__openai",
-                "modelType": "clip",
+                "entries": json.dumps(
+                    {
+                        "clip": {"textual": {"modelName": "ViT-B-32__openai"}},
+                    },
+                ),
                 "text": "test search query",
-                "options": json.dumps({"mode": "text"}),
             },
         )
 
         actual = response.json()
         assert response.status_code == 200
-        assert np.allclose(expected, actual)
+        assert isinstance(actual, dict)
+        assert isinstance(actual.get("clip", None), list)
+        assert np.allclose(expected, actual["clip"])
 
     def test_face_endpoint(self, pil_image: Image.Image, responses: dict[str, Any], deployed_app: TestClient) -> None:
         byte_image = BytesIO()
         pil_image.save(byte_image, format="jpeg")
-        expected = responses["facial-recognition"]
 
         response = deployed_app.post(
             "http://localhost:3003/predict",
             data={
-                "modelName": "buffalo_l",
-                "modelType": "facial-recognition",
-                "options": json.dumps({"minScore": 0.034}),
+                "entries": json.dumps(
+                    {
+                        "facial-recognition": {
+                            "detection": {"modelName": "buffalo_l", "options": {"minScore": 0.034}},
+                            "recognition": {"modelName": "buffalo_l"},
+                        }
+                    }
+                )
             },
             files={"image": byte_image.getvalue()},
         )
 
         actual = response.json()
         assert response.status_code == 200
-        assert len(expected) == len(actual)
-        for expected_face, actual_face in zip(expected, actual):
-            assert expected_face["imageHeight"] == actual_face["imageHeight"]
-            assert expected_face["imageWidth"] == actual_face["imageWidth"]
+        assert isinstance(actual, dict)
+        assert actual.get("imageHeight", None) == responses["imageHeight"]
+        assert actual.get("imageWidth", None) == responses["imageWidth"]
+        assert "facial-recognition" in actual and isinstance(actual["facial-recognition"], list)
+        assert len(actual["facial-recognition"]) == len(responses["facial-recognition"])
+
+        for expected_face, actual_face in zip(responses["facial-recognition"], actual["facial-recognition"]):
             assert expected_face["boundingBox"] == actual_face["boundingBox"]
             assert np.allclose(expected_face["embedding"], actual_face["embedding"])
             assert np.allclose(expected_face["score"], actual_face["score"])
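For reference, the updated endpoint tests above all exercise the reworked `/predict` contract: a single multipart form with an `entries` JSON field describing the requested pipeline, plus either an `image` file or a `text` field, and a response keyed by task. Below is a minimal sketch of calling it from outside the test suite; the host/port and model names are the defaults used in these tests, `photo.jpg` is a placeholder path, and `requests` is just one possible HTTP client, not something this PR depends on.

```python
import json

import requests  # assumed available; any HTTP client works

BASE_URL = "http://localhost:3003"  # default port used by the tests above

# CLIP visual embedding for an image
entries = {"clip": {"visual": {"modelName": "ViT-B-32__openai"}}}
with open("photo.jpg", "rb") as f:  # placeholder image path
    resp = requests.post(
        f"{BASE_URL}/predict",
        data={"entries": json.dumps(entries)},
        files={"image": f},
    )
embedding = resp.json()["clip"]  # list of floats, one per embedding dimension

# Face detection + recognition chained in a single request
entries = {
    "facial-recognition": {
        "detection": {"modelName": "buffalo_l", "options": {"minScore": 0.034}},
        "recognition": {"modelName": "buffalo_l"},
    }
}
with open("photo.jpg", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/predict",
        data={"entries": json.dumps(entries)},
        files={"image": f},
    )
# Each face carries boundingBox (x1/y1/x2/y2), a 512-d embedding, and a score,
# matching the assertions in test_face_endpoint above.
faces = resp.json()["facial-recognition"]
```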
diff --git a/machine-learning/locustfile.py b/machine-learning/locustfile.py
index d32bc17d55..81087bee8c 100644
--- a/machine-learning/locustfile.py
+++ b/machine-learning/locustfile.py
@@ -37,7 +37,6 @@ def on_test_start(environment: Environment, **kwargs: Any) -> None:
     global byte_image
     assert environment.parsed_options is not None
     image = Image.new("RGB", (environment.parsed_options.image_size, environment.parsed_options.image_size))
-    byte_image = BytesIO()
     image.save(byte_image, format="jpeg")
 
 
@@ -45,34 +44,25 @@ class InferenceLoadTest(HttpUser):
     abstract: bool = True
     host = "http://127.0.0.1:3003"
     data: bytes
-    headers: dict[str, str] = {"Content-Type": "image/jpg"}
 
     # re-use the image across all instances in a process
     def on_start(self) -> None:
-        global byte_image
         self.data = byte_image.getvalue()
 
 
 class CLIPTextFormDataLoadTest(InferenceLoadTest):
     @task
     def encode_text(self) -> None:
-        data = [
-            ("modelName", self.environment.parsed_options.clip_model),
-            ("modelType", "clip"),
-            ("options", json.dumps({"mode": "text"})),
-            ("text", "test search query"),
-        ]
+        request = {"clip": {"textual": {"modelName": self.environment.parsed_options.clip_model}}}
+        data = [("entries", json.dumps(request)), ("text", "test search query")]
         self.client.post("/predict", data=data)
 
 
 class CLIPVisionFormDataLoadTest(InferenceLoadTest):
     @task
     def encode_image(self) -> None:
-        data = [
-            ("modelName", self.environment.parsed_options.clip_model),
-            ("modelType", "clip"),
-            ("options", json.dumps({"mode": "vision"})),
-        ]
+        request = {"clip": {"visual": {"modelName": self.environment.parsed_options.clip_model, "options": {}}}}
+        data = [("entries", json.dumps(request))]
         files = {"image": self.data}
         self.client.post("/predict", data=data, files=files)
 
@@ -80,11 +70,18 @@ class CLIPVisionFormDataLoadTest(InferenceLoadTest):
 class RecognitionFormDataLoadTest(InferenceLoadTest):
     @task
     def recognize(self) -> None:
-        data = [
-            ("modelName", self.environment.parsed_options.face_model),
-            ("modelType", "facial-recognition"),
-            ("options", json.dumps({"minScore": self.environment.parsed_options.face_min_score})),
-        ]
+        request = {
+            "facial-recognition": {
+                "recognition": {
+                    "modelName": self.environment.parsed_options.face_model,
+                    "options": {"minScore": self.environment.parsed_options.face_min_score},
+                },
+                "detection": {
+                    "modelName": self.environment.parsed_options.face_model,
+                },
+            }
+        }
+        data = [("entries", json.dumps(request))]
         files = {"image": self.data}
 
         self.client.post("/predict", data=data, files=files)
diff --git a/machine-learning/responses.json b/machine-learning/responses.json
index 209f05cffb..8fe9cd61cc 100644
--- a/machine-learning/responses.json
+++ b/machine-learning/responses.json
@@ -213,8 +213,6 @@
   },
   "facial-recognition": [
     {
-      "imageWidth": 600,
-      "imageHeight": 800,
       "boundingBox": {
         "x1": 690.0,
         "y1": -89.0,
@@ -325,5 +323,7 @@
         -0.077056274, 0.002099529
       ]
     }
-  ]
+  ],
+  "imageWidth": 600,
+  "imageHeight": 800
 }
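With the fixture reshuffled, the facial-recognition response reports the image dimensions once at the top level instead of repeating them on every face. A trimmed sketch of the shape the endpoint test asserts against, written as a Python literal; the numeric values below are placeholders (only `imageWidth`/`imageHeight` and the key layout mirror the fixture):

```python
# Illustrative facial-recognition response; values are placeholders,
# keys follow responses.json and the assertions in test_face_endpoint.
response = {
    "imageWidth": 600,
    "imageHeight": 800,
    "facial-recognition": [
        {
            "boundingBox": {"x1": 100.0, "y1": 50.0, "x2": 300.0, "y2": 250.0},
            "score": 0.98,
            "embedding": [0.012, -0.077],  # 512 values in practice
        },
    ],
}
```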
diff --git a/mobile/openapi/README.md b/mobile/openapi/README.md
index 93ea6a551c..914d291e1e 100644
--- a/mobile/openapi/README.md
+++ b/mobile/openapi/README.md
@@ -279,7 +279,6 @@ Class | Method | HTTP request | Description
  - [BulkIdResponseDto](doc//BulkIdResponseDto.md)
  - [BulkIdsDto](doc//BulkIdsDto.md)
  - [CLIPConfig](doc//CLIPConfig.md)
- - [CLIPMode](doc//CLIPMode.md)
  - [CQMode](doc//CQMode.md)
  - [ChangePasswordDto](doc//ChangePasswordDto.md)
  - [CheckExistingAssetsDto](doc//CheckExistingAssetsDto.md)
@@ -299,6 +298,7 @@ Class | Method | HTTP request | Description
  - [EntityType](doc//EntityType.md)
  - [ExifResponseDto](doc//ExifResponseDto.md)
  - [FaceDto](doc//FaceDto.md)
+ - [FacialRecognitionConfig](doc//FacialRecognitionConfig.md)
  - [FileChecksumDto](doc//FileChecksumDto.md)
  - [FileChecksumResponseDto](doc//FileChecksumResponseDto.md)
  - [FileReportDto](doc//FileReportDto.md)
@@ -328,7 +328,6 @@ Class | Method | HTTP request | Description
  - [MemoryUpdateDto](doc//MemoryUpdateDto.md)
  - [MergePersonDto](doc//MergePersonDto.md)
  - [MetadataSearchDto](doc//MetadataSearchDto.md)
- - [ModelType](doc//ModelType.md)
  - [OAuthAuthorizeResponseDto](doc//OAuthAuthorizeResponseDto.md)
  - [OAuthCallbackDto](doc//OAuthCallbackDto.md)
  - [OAuthConfigDto](doc//OAuthConfigDto.md)
@@ -348,7 +347,6 @@ Class | Method | HTTP request | Description
  - [QueueStatusDto](doc//QueueStatusDto.md)
  - [ReactionLevel](doc//ReactionLevel.md)
  - [ReactionType](doc//ReactionType.md)
- - [RecognitionConfig](doc//RecognitionConfig.md)
  - [ReverseGeocodingStateResponseDto](doc//ReverseGeocodingStateResponseDto.md)
  - [ScanLibraryDto](doc//ScanLibraryDto.md)
  - [SearchAlbumResponseDto](doc//SearchAlbumResponseDto.md)
diff --git a/mobile/openapi/lib/api.dart b/mobile/openapi/lib/api.dart
index 17602fafaf..4d7a4a533d 100644
--- a/mobile/openapi/lib/api.dart
+++ b/mobile/openapi/lib/api.dart
@@ -106,7 +106,6 @@ part 'model/avatar_update.dart';
 part 'model/bulk_id_response_dto.dart';
 part 'model/bulk_ids_dto.dart';
 part 'model/clip_config.dart';
-part 'model/clip_mode.dart';
 part 'model/cq_mode.dart';
 part 'model/change_password_dto.dart';
 part 'model/check_existing_assets_dto.dart';
@@ -126,6 +125,7 @@ part 'model/email_notifications_update.dart';
 part 'model/entity_type.dart';
 part 'model/exif_response_dto.dart';
 part 'model/face_dto.dart';
+part 'model/facial_recognition_config.dart';
 part 'model/file_checksum_dto.dart';
 part 'model/file_checksum_response_dto.dart';
 part 'model/file_report_dto.dart';
@@ -155,7 +155,6 @@ part 'model/memory_update.dart';
 part 'model/memory_update_dto.dart';
 part 'model/merge_person_dto.dart';
 part 'model/metadata_search_dto.dart';
-part 'model/model_type.dart';
 part 'model/o_auth_authorize_response_dto.dart';
 part 'model/o_auth_callback_dto.dart';
 part 'model/o_auth_config_dto.dart';
@@ -175,7 +174,6 @@ part 'model/places_response_dto.dart';
 part 'model/queue_status_dto.dart';
 part 'model/reaction_level.dart';
 part 'model/reaction_type.dart';
-part 'model/recognition_config.dart';
 part 'model/reverse_geocoding_state_response_dto.dart';
 part 'model/scan_library_dto.dart';
 part 'model/search_album_response_dto.dart';
diff --git a/mobile/openapi/lib/api_client.dart b/mobile/openapi/lib/api_client.dart
index 94187719a8..dbfec53b93 100644
--- a/mobile/openapi/lib/api_client.dart
+++ b/mobile/openapi/lib/api_client.dart
@@ -276,8 +276,6 @@ class ApiClient {
           return BulkIdsDto.fromJson(value);
         case 'CLIPConfig':
           return CLIPConfig.fromJson(value);
-        case 'CLIPMode':
-          return CLIPModeTypeTransformer().decode(value);
         case 'CQMode':
           return CQModeTypeTransformer().decode(value);
         case 'ChangePasswordDto':
@@ -316,6 +314,8 @@ class ApiClient {
           return ExifResponseDto.fromJson(value);
         case 'FaceDto':
           return FaceDto.fromJson(value);
+        case 'FacialRecognitionConfig':
+          return FacialRecognitionConfig.fromJson(value);
         case 'FileChecksumDto':
           return FileChecksumDto.fromJson(value);
         case 'FileChecksumResponseDto':
@@ -374,8 +374,6 @@ class ApiClient {
           return MergePersonDto.fromJson(value);
         case 'MetadataSearchDto':
           return MetadataSearchDto.fromJson(value);
-        case 'ModelType':
-          return ModelTypeTypeTransformer().decode(value);
         case 'OAuthAuthorizeResponseDto':
           return OAuthAuthorizeResponseDto.fromJson(value);
         case 'OAuthCallbackDto':
@@ -414,8 +412,6 @@ class ApiClient {
           return ReactionLevelTypeTransformer().decode(value);
         case 'ReactionType':
           return ReactionTypeTypeTransformer().decode(value);
-        case 'RecognitionConfig':
-          return RecognitionConfig.fromJson(value);
         case 'ReverseGeocodingStateResponseDto':
           return ReverseGeocodingStateResponseDto.fromJson(value);
         case 'ScanLibraryDto':
diff --git a/mobile/openapi/lib/api_helper.dart b/mobile/openapi/lib/api_helper.dart
index 6f0acc0976..762b728869 100644
--- a/mobile/openapi/lib/api_helper.dart
+++ b/mobile/openapi/lib/api_helper.dart
@@ -76,9 +76,6 @@ String parameterToString(dynamic value) {
   if (value is AudioCodec) {
     return AudioCodecTypeTransformer().encode(value).toString();
   }
-  if (value is CLIPMode) {
-    return CLIPModeTypeTransformer().encode(value).toString();
-  }
   if (value is CQMode) {
     return CQModeTypeTransformer().encode(value).toString();
   }
@@ -106,9 +103,6 @@ String parameterToString(dynamic value) {
   if (value is MemoryType) {
     return MemoryTypeTypeTransformer().encode(value).toString();
   }
-  if (value is ModelType) {
-    return ModelTypeTypeTransformer().encode(value).toString();
-  }
   if (value is PathEntityType) {
     return PathEntityTypeTypeTransformer().encode(value).toString();
   }
diff --git a/mobile/openapi/lib/model/clip_config.dart b/mobile/openapi/lib/model/clip_config.dart
index 5a1d429aea..6e95c15fbf 100644
--- a/mobile/openapi/lib/model/clip_config.dart
+++ b/mobile/openapi/lib/model/clip_config.dart
@@ -14,63 +14,31 @@ class CLIPConfig {
   /// Returns a new [CLIPConfig] instance.
   CLIPConfig({
     required this.enabled,
-    this.mode,
     required this.modelName,
-    this.modelType,
   });
 
   bool enabled;
 
-  ///
-  /// Please note: This property should have been non-nullable! Since the specification file
-  /// does not include a default value (using the "default:" property), however, the generated
-  /// source code must fall back to having a nullable type.
-  /// Consider adding a "default:" property in the specification file to hide this note.
-  ///
-  CLIPMode? mode;
-
   String modelName;
 
-  ///
-  /// Please note: This property should have been non-nullable! Since the specification file
-  /// does not include a default value (using the "default:" property), however, the generated
-  /// source code must fall back to having a nullable type.
-  /// Consider adding a "default:" property in the specification file to hide this note.
-  ///
-  ModelType? modelType;
-
   @override
   bool operator ==(Object other) => identical(this, other) || other is CLIPConfig &&
     other.enabled == enabled &&
-    other.mode == mode &&
-    other.modelName == modelName &&
-    other.modelType == modelType;
+    other.modelName == modelName;
 
   @override
   int get hashCode =>
     // ignore: unnecessary_parenthesis
     (enabled.hashCode) +
-    (mode == null ? 0 : mode!.hashCode) +
-    (modelName.hashCode) +
-    (modelType == null ? 0 : modelType!.hashCode);
+    (modelName.hashCode);
 
   @override
-  String toString() => 'CLIPConfig[enabled=$enabled, mode=$mode, modelName=$modelName, modelType=$modelType]';
+  String toString() => 'CLIPConfig[enabled=$enabled, modelName=$modelName]';
 
   Map<String, dynamic> toJson() {
     final json = <String, dynamic>{};
       json[r'enabled'] = this.enabled;
-    if (this.mode != null) {
-      json[r'mode'] = this.mode;
-    } else {
-    //  json[r'mode'] = null;
-    }
       json[r'modelName'] = this.modelName;
-    if (this.modelType != null) {
-      json[r'modelType'] = this.modelType;
-    } else {
-    //  json[r'modelType'] = null;
-    }
     return json;
   }
 
@@ -83,9 +51,7 @@ class CLIPConfig {
 
       return CLIPConfig(
         enabled: mapValueOfType<bool>(json, r'enabled')!,
-        mode: CLIPMode.fromJson(json[r'mode']),
         modelName: mapValueOfType<String>(json, r'modelName')!,
-        modelType: ModelType.fromJson(json[r'modelType']),
       );
     }
     return null;
diff --git a/mobile/openapi/lib/model/clip_mode.dart b/mobile/openapi/lib/model/clip_mode.dart
deleted file mode 100644
index 98a38759da..0000000000
--- a/mobile/openapi/lib/model/clip_mode.dart
+++ /dev/null
@@ -1,85 +0,0 @@
-//
-// AUTO-GENERATED FILE, DO NOT MODIFY!
-//
-// @dart=2.18
-
-// ignore_for_file: unused_element, unused_import
-// ignore_for_file: always_put_required_named_parameters_first
-// ignore_for_file: constant_identifier_names
-// ignore_for_file: lines_longer_than_80_chars
-
-part of openapi.api;
-
-
-class CLIPMode {
-  /// Instantiate a new enum with the provided [value].
-  const CLIPMode._(this.value);
-
-  /// The underlying value of this enum member.
-  final String value;
-
-  @override
-  String toString() => value;
-
-  String toJson() => value;
-
-  static const vision = CLIPMode._(r'vision');
-  static const text = CLIPMode._(r'text');
-
-  /// List of all possible values in this [enum][CLIPMode].
-  static const values = <CLIPMode>[
-    vision,
-    text,
-  ];
-
-  static CLIPMode? fromJson(dynamic value) => CLIPModeTypeTransformer().decode(value);
-
-  static List<CLIPMode> listFromJson(dynamic json, {bool growable = false,}) {
-    final result = <CLIPMode>[];
-    if (json is List && json.isNotEmpty) {
-      for (final row in json) {
-        final value = CLIPMode.fromJson(row);
-        if (value != null) {
-          result.add(value);
-        }
-      }
-    }
-    return result.toList(growable: growable);
-  }
-}
-
-/// Transformation class that can [encode] an instance of [CLIPMode] to String,
-/// and [decode] dynamic data back to [CLIPMode].
-class CLIPModeTypeTransformer {
-  factory CLIPModeTypeTransformer() => _instance ??= const CLIPModeTypeTransformer._();
-
-  const CLIPModeTypeTransformer._();
-
-  String encode(CLIPMode data) => data.value;
-
-  /// Decodes a [dynamic value][data] to a CLIPMode.
-  ///
-  /// If [allowNull] is true and the [dynamic value][data] cannot be decoded successfully,
-  /// then null is returned. However, if [allowNull] is false and the [dynamic value][data]
-  /// cannot be decoded successfully, then an [UnimplementedError] is thrown.
-  ///
-  /// The [allowNull] is very handy when an API changes and a new enum value is added or removed,
-  /// and users are still using an old app with the old code.
-  CLIPMode? decode(dynamic data, {bool allowNull = true}) {
-    if (data != null) {
-      switch (data) {
-        case r'vision': return CLIPMode.vision;
-        case r'text': return CLIPMode.text;
-        default:
-          if (!allowNull) {
-            throw ArgumentError('Unknown enum value to decode: $data');
-          }
-      }
-    }
-    return null;
-  }
-
-  /// Singleton [CLIPModeTypeTransformer] instance.
-  static CLIPModeTypeTransformer? _instance;
-}
-
diff --git a/mobile/openapi/lib/model/recognition_config.dart b/mobile/openapi/lib/model/facial_recognition_config.dart
similarity index 59%
rename from mobile/openapi/lib/model/recognition_config.dart
rename to mobile/openapi/lib/model/facial_recognition_config.dart
index 3080023bbd..8a3a5ab9c2 100644
--- a/mobile/openapi/lib/model/recognition_config.dart
+++ b/mobile/openapi/lib/model/facial_recognition_config.dart
@@ -10,15 +10,14 @@
 
 part of openapi.api;
 
-class RecognitionConfig {
-  /// Returns a new [RecognitionConfig] instance.
-  RecognitionConfig({
+class FacialRecognitionConfig {
+  /// Returns a new [FacialRecognitionConfig] instance.
+  FacialRecognitionConfig({
     required this.enabled,
     required this.maxDistance,
     required this.minFaces,
     required this.minScore,
     required this.modelName,
-    this.modelType,
   });
 
   bool enabled;
@@ -36,22 +35,13 @@ class RecognitionConfig {
 
   String modelName;
 
-  ///
-  /// Please note: This property should have been non-nullable! Since the specification file
-  /// does not include a default value (using the "default:" property), however, the generated
-  /// source code must fall back to having a nullable type.
-  /// Consider adding a "default:" property in the specification file to hide this note.
-  ///
-  ModelType? modelType;
-
   @override
-  bool operator ==(Object other) => identical(this, other) || other is RecognitionConfig &&
+  bool operator ==(Object other) => identical(this, other) || other is FacialRecognitionConfig &&
     other.enabled == enabled &&
     other.maxDistance == maxDistance &&
     other.minFaces == minFaces &&
     other.minScore == minScore &&
-    other.modelName == modelName &&
-    other.modelType == modelType;
+    other.modelName == modelName;
 
   @override
   int get hashCode =>
@@ -60,11 +50,10 @@ class RecognitionConfig {
     (maxDistance.hashCode) +
     (minFaces.hashCode) +
     (minScore.hashCode) +
-    (modelName.hashCode) +
-    (modelType == null ? 0 : modelType!.hashCode);
+    (modelName.hashCode);
 
   @override
-  String toString() => 'RecognitionConfig[enabled=$enabled, maxDistance=$maxDistance, minFaces=$minFaces, minScore=$minScore, modelName=$modelName, modelType=$modelType]';
+  String toString() => 'FacialRecognitionConfig[enabled=$enabled, maxDistance=$maxDistance, minFaces=$minFaces, minScore=$minScore, modelName=$modelName]';
 
   Map<String, dynamic> toJson() {
     final json = <String, dynamic>{};
@@ -73,38 +62,32 @@ class RecognitionConfig {
       json[r'minFaces'] = this.minFaces;
       json[r'minScore'] = this.minScore;
       json[r'modelName'] = this.modelName;
-    if (this.modelType != null) {
-      json[r'modelType'] = this.modelType;
-    } else {
-    //  json[r'modelType'] = null;
-    }
     return json;
   }
 
-  /// Returns a new [RecognitionConfig] instance and imports its values from
+  /// Returns a new [FacialRecognitionConfig] instance and imports its values from
   /// [value] if it's a [Map], null otherwise.
   // ignore: prefer_constructors_over_static_methods
-  static RecognitionConfig? fromJson(dynamic value) {
+  static FacialRecognitionConfig? fromJson(dynamic value) {
     if (value is Map) {
       final json = value.cast<String, dynamic>();
 
-      return RecognitionConfig(
+      return FacialRecognitionConfig(
         enabled: mapValueOfType<bool>(json, r'enabled')!,
         maxDistance: mapValueOfType<double>(json, r'maxDistance')!,
         minFaces: mapValueOfType<int>(json, r'minFaces')!,
         minScore: mapValueOfType<double>(json, r'minScore')!,
         modelName: mapValueOfType<String>(json, r'modelName')!,
-        modelType: ModelType.fromJson(json[r'modelType']),
       );
     }
     return null;
   }
 
-  static List<RecognitionConfig> listFromJson(dynamic json, {bool growable = false,}) {
-    final result = <RecognitionConfig>[];
+  static List<FacialRecognitionConfig> listFromJson(dynamic json, {bool growable = false,}) {
+    final result = <FacialRecognitionConfig>[];
     if (json is List && json.isNotEmpty) {
       for (final row in json) {
-        final value = RecognitionConfig.fromJson(row);
+        final value = FacialRecognitionConfig.fromJson(row);
         if (value != null) {
           result.add(value);
         }
@@ -113,12 +96,12 @@ class RecognitionConfig {
     return result.toList(growable: growable);
   }
 
-  static Map<String, RecognitionConfig> mapFromJson(dynamic json) {
-    final map = <String, RecognitionConfig>{};
+  static Map<String, FacialRecognitionConfig> mapFromJson(dynamic json) {
+    final map = <String, FacialRecognitionConfig>{};
     if (json is Map && json.isNotEmpty) {
       json = json.cast<String, dynamic>(); // ignore: parameter_assignments
       for (final entry in json.entries) {
-        final value = RecognitionConfig.fromJson(entry.value);
+        final value = FacialRecognitionConfig.fromJson(entry.value);
         if (value != null) {
           map[entry.key] = value;
         }
@@ -127,14 +110,14 @@ class RecognitionConfig {
     return map;
   }
 
-  // maps a json object with a list of RecognitionConfig-objects as value to a dart map
-  static Map<String, List<RecognitionConfig>> mapListFromJson(dynamic json, {bool growable = false,}) {
-    final map = <String, List<RecognitionConfig>>{};
+  // maps a json object with a list of FacialRecognitionConfig-objects as value to a dart map
+  static Map<String, List<FacialRecognitionConfig>> mapListFromJson(dynamic json, {bool growable = false,}) {
+    final map = <String, List<FacialRecognitionConfig>>{};
     if (json is Map && json.isNotEmpty) {
       // ignore: parameter_assignments
       json = json.cast<String, dynamic>();
       for (final entry in json.entries) {
-        map[entry.key] = RecognitionConfig.listFromJson(entry.value, growable: growable,);
+        map[entry.key] = FacialRecognitionConfig.listFromJson(entry.value, growable: growable,);
       }
     }
     return map;
diff --git a/mobile/openapi/lib/model/model_type.dart b/mobile/openapi/lib/model/model_type.dart
deleted file mode 100644
index 36f7424199..0000000000
--- a/mobile/openapi/lib/model/model_type.dart
+++ /dev/null
@@ -1,85 +0,0 @@
-//
-// AUTO-GENERATED FILE, DO NOT MODIFY!
-//
-// @dart=2.18
-
-// ignore_for_file: unused_element, unused_import
-// ignore_for_file: always_put_required_named_parameters_first
-// ignore_for_file: constant_identifier_names
-// ignore_for_file: lines_longer_than_80_chars
-
-part of openapi.api;
-
-
-class ModelType {
-  /// Instantiate a new enum with the provided [value].
-  const ModelType._(this.value);
-
-  /// The underlying value of this enum member.
-  final String value;
-
-  @override
-  String toString() => value;
-
-  String toJson() => value;
-
-  static const facialRecognition = ModelType._(r'facial-recognition');
-  static const clip = ModelType._(r'clip');
-
-  /// List of all possible values in this [enum][ModelType].
-  static const values = <ModelType>[
-    facialRecognition,
-    clip,
-  ];
-
-  static ModelType? fromJson(dynamic value) => ModelTypeTypeTransformer().decode(value);
-
-  static List<ModelType> listFromJson(dynamic json, {bool growable = false,}) {
-    final result = <ModelType>[];
-    if (json is List && json.isNotEmpty) {
-      for (final row in json) {
-        final value = ModelType.fromJson(row);
-        if (value != null) {
-          result.add(value);
-        }
-      }
-    }
-    return result.toList(growable: growable);
-  }
-}
-
-/// Transformation class that can [encode] an instance of [ModelType] to String,
-/// and [decode] dynamic data back to [ModelType].
-class ModelTypeTypeTransformer {
-  factory ModelTypeTypeTransformer() => _instance ??= const ModelTypeTypeTransformer._();
-
-  const ModelTypeTypeTransformer._();
-
-  String encode(ModelType data) => data.value;
-
-  /// Decodes a [dynamic value][data] to a ModelType.
-  ///
-  /// If [allowNull] is true and the [dynamic value][data] cannot be decoded successfully,
-  /// then null is returned. However, if [allowNull] is false and the [dynamic value][data]
-  /// cannot be decoded successfully, then an [UnimplementedError] is thrown.
-  ///
-  /// The [allowNull] is very handy when an API changes and a new enum value is added or removed,
-  /// and users are still using an old app with the old code.
-  ModelType? decode(dynamic data, {bool allowNull = true}) {
-    if (data != null) {
-      switch (data) {
-        case r'facial-recognition': return ModelType.facialRecognition;
-        case r'clip': return ModelType.clip;
-        default:
-          if (!allowNull) {
-            throw ArgumentError('Unknown enum value to decode: $data');
-          }
-      }
-    }
-    return null;
-  }
-
-  /// Singleton [ModelTypeTypeTransformer] instance.
-  static ModelTypeTypeTransformer? _instance;
-}
-
diff --git a/mobile/openapi/lib/model/system_config_machine_learning_dto.dart b/mobile/openapi/lib/model/system_config_machine_learning_dto.dart
index fc3aae1804..3923bacad4 100644
--- a/mobile/openapi/lib/model/system_config_machine_learning_dto.dart
+++ b/mobile/openapi/lib/model/system_config_machine_learning_dto.dart
@@ -26,7 +26,7 @@ class SystemConfigMachineLearningDto {
 
   bool enabled;
 
-  RecognitionConfig facialRecognition;
+  FacialRecognitionConfig facialRecognition;
 
   String url;
 
@@ -71,7 +71,7 @@ class SystemConfigMachineLearningDto {
         clip: CLIPConfig.fromJson(json[r'clip'])!,
         duplicateDetection: DuplicateDetectionConfig.fromJson(json[r'duplicateDetection'])!,
         enabled: mapValueOfType<bool>(json, r'enabled')!,
-        facialRecognition: RecognitionConfig.fromJson(json[r'facialRecognition'])!,
+        facialRecognition: FacialRecognitionConfig.fromJson(json[r'facialRecognition'])!,
         url: mapValueOfType<String>(json, r'url')!,
       );
     }
diff --git a/open-api/immich-openapi-specs.json b/open-api/immich-openapi-specs.json
index 94feb6cfcf..c53f30527c 100644
--- a/open-api/immich-openapi-specs.json
+++ b/open-api/immich-openapi-specs.json
@@ -7878,14 +7878,8 @@
           "enabled": {
             "type": "boolean"
           },
-          "mode": {
-            "$ref": "#/components/schemas/CLIPMode"
-          },
           "modelName": {
             "type": "string"
-          },
-          "modelType": {
-            "$ref": "#/components/schemas/ModelType"
           }
         },
         "required": [
@@ -7894,13 +7888,6 @@
         ],
         "type": "object"
       },
-      "CLIPMode": {
-        "enum": [
-          "vision",
-          "text"
-        ],
-        "type": "string"
-      },
       "CQMode": {
         "enum": [
           "auto",
@@ -8323,6 +8310,40 @@
         ],
         "type": "object"
       },
+      "FacialRecognitionConfig": {
+        "properties": {
+          "enabled": {
+            "type": "boolean"
+          },
+          "maxDistance": {
+            "format": "float",
+            "maximum": 2,
+            "minimum": 0,
+            "type": "number"
+          },
+          "minFaces": {
+            "minimum": 1,
+            "type": "integer"
+          },
+          "minScore": {
+            "format": "float",
+            "maximum": 1,
+            "minimum": 0,
+            "type": "number"
+          },
+          "modelName": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "enabled",
+          "maxDistance",
+          "minFaces",
+          "minScore",
+          "modelName"
+        ],
+        "type": "object"
+      },
       "FileChecksumDto": {
         "properties": {
           "filenames": {
@@ -9039,13 +9060,6 @@
         },
         "type": "object"
       },
-      "ModelType": {
-        "enum": [
-          "facial-recognition",
-          "clip"
-        ],
-        "type": "string"
-      },
       "OAuthAuthorizeResponseDto": {
         "properties": {
           "url": {
@@ -9379,43 +9393,6 @@
         ],
         "type": "string"
       },
-      "RecognitionConfig": {
-        "properties": {
-          "enabled": {
-            "type": "boolean"
-          },
-          "maxDistance": {
-            "format": "float",
-            "maximum": 2,
-            "minimum": 0,
-            "type": "number"
-          },
-          "minFaces": {
-            "minimum": 1,
-            "type": "integer"
-          },
-          "minScore": {
-            "format": "float",
-            "maximum": 1,
-            "minimum": 0,
-            "type": "number"
-          },
-          "modelName": {
-            "type": "string"
-          },
-          "modelType": {
-            "$ref": "#/components/schemas/ModelType"
-          }
-        },
-        "required": [
-          "enabled",
-          "maxDistance",
-          "minFaces",
-          "minScore",
-          "modelName"
-        ],
-        "type": "object"
-      },
       "ReverseGeocodingStateResponseDto": {
         "properties": {
           "lastImportFileName": {
@@ -10521,7 +10498,7 @@
             "type": "boolean"
           },
           "facialRecognition": {
-            "$ref": "#/components/schemas/RecognitionConfig"
+            "$ref": "#/components/schemas/FacialRecognitionConfig"
           },
           "url": {
             "type": "string"
diff --git a/open-api/typescript-sdk/src/fetch-client.ts b/open-api/typescript-sdk/src/fetch-client.ts
index c604de9f8a..d294110be1 100644
--- a/open-api/typescript-sdk/src/fetch-client.ts
+++ b/open-api/typescript-sdk/src/fetch-client.ts
@@ -962,27 +962,24 @@ export type SystemConfigLoggingDto = {
 };
 export type ClipConfig = {
     enabled: boolean;
-    mode?: CLIPMode;
     modelName: string;
-    modelType?: ModelType;
 };
 export type DuplicateDetectionConfig = {
     enabled: boolean;
     maxDistance: number;
 };
-export type RecognitionConfig = {
+export type FacialRecognitionConfig = {
     enabled: boolean;
     maxDistance: number;
     minFaces: number;
     minScore: number;
     modelName: string;
-    modelType?: ModelType;
 };
 export type SystemConfigMachineLearningDto = {
     clip: ClipConfig;
     duplicateDetection: DuplicateDetectionConfig;
     enabled: boolean;
-    facialRecognition: RecognitionConfig;
+    facialRecognition: FacialRecognitionConfig;
     url: string;
 };
 export type SystemConfigMapDto = {
@@ -3074,14 +3071,6 @@ export enum LogLevel {
     Error = "error",
     Fatal = "fatal"
 }
-export enum CLIPMode {
-    Vision = "vision",
-    Text = "text"
-}
-export enum ModelType {
-    FacialRecognition = "facial-recognition",
-    Clip = "clip"
-}
 export enum TimeBucketSize {
     Day = "DAY",
     Month = "MONTH"
diff --git a/server/src/dtos/model-config.dto.ts b/server/src/dtos/model-config.dto.ts
index a3efd19f82..5dc8237414 100644
--- a/server/src/dtos/model-config.dto.ts
+++ b/server/src/dtos/model-config.dto.ts
@@ -1,8 +1,7 @@
 import { ApiProperty } from '@nestjs/swagger';
 import { Type } from 'class-transformer';
-import { IsEnum, IsNotEmpty, IsNumber, IsString, Max, Min } from 'class-validator';
-import { CLIPMode, ModelType } from 'src/interfaces/machine-learning.interface';
-import { Optional, ValidateBoolean } from 'src/validation';
+import { IsNotEmpty, IsNumber, IsString, Max, Min } from 'class-validator';
+import { ValidateBoolean } from 'src/validation';
 
 export class TaskConfig {
   @ValidateBoolean()
@@ -13,19 +12,9 @@ export class ModelConfig extends TaskConfig {
   @IsString()
   @IsNotEmpty()
   modelName!: string;
-
-  @IsEnum(ModelType)
-  @Optional()
-  @ApiProperty({ enumName: 'ModelType', enum: ModelType })
-  modelType?: ModelType;
 }
 
-export class CLIPConfig extends ModelConfig {
-  @IsEnum(CLIPMode)
-  @Optional()
-  @ApiProperty({ enumName: 'CLIPMode', enum: CLIPMode })
-  mode?: CLIPMode;
-}
+export class CLIPConfig extends ModelConfig {}
 
 export class DuplicateDetectionConfig extends TaskConfig {
   @IsNumber()
@@ -36,7 +25,7 @@ export class DuplicateDetectionConfig extends TaskConfig {
   maxDistance!: number;
 }
 
-export class RecognitionConfig extends ModelConfig {
+export class FacialRecognitionConfig extends ModelConfig {
   @IsNumber()
   @Min(0)
   @Max(1)
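
The DTO hierarchy that results from this file is now enum-free: `TaskConfig` holds `enabled`, `ModelConfig` adds `modelName`, and the leaf classes only add task-specific numeric fields. A condensed sketch of the resulting shape is below; the decorators on fields cut off by the hunk are inferred from the bounds in the OpenAPI schema above, not copied from the file.

```ts
// Condensed sketch of the post-change hierarchy; decorators not visible in the
// hunk are inferred from the OpenAPI bounds above rather than copied verbatim.
import { IsInt, IsNotEmpty, IsNumber, IsString, Max, Min } from 'class-validator';
import { ValidateBoolean } from 'src/validation';

export class TaskConfig {
  @ValidateBoolean()
  enabled!: boolean;
}

export class ModelConfig extends TaskConfig {
  @IsString()
  @IsNotEmpty()
  modelName!: string;
}

export class CLIPConfig extends ModelConfig {}

export class FacialRecognitionConfig extends ModelConfig {
  @IsNumber()
  @Min(0)
  @Max(1)
  minScore!: number;

  @IsNumber()
  @Min(0)
  @Max(2)
  maxDistance!: number;

  @IsInt()
  @Min(1)
  minFaces!: number;
}
```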
diff --git a/server/src/dtos/system-config.dto.ts b/server/src/dtos/system-config.dto.ts
index 4dcea6ec3d..b593c0edbf 100644
--- a/server/src/dtos/system-config.dto.ts
+++ b/server/src/dtos/system-config.dto.ts
@@ -30,7 +30,7 @@ import {
   TranscodePolicy,
   VideoCodec,
 } from 'src/config';
-import { CLIPConfig, DuplicateDetectionConfig, RecognitionConfig } from 'src/dtos/model-config.dto';
+import { CLIPConfig, DuplicateDetectionConfig, FacialRecognitionConfig } from 'src/dtos/model-config.dto';
 import { ConcurrentQueueName, QueueName } from 'src/interfaces/job.interface';
 import { ValidateBoolean, validateCronExpression } from 'src/validation';
 
@@ -270,10 +270,10 @@ class SystemConfigMachineLearningDto {
   @IsObject()
   duplicateDetection!: DuplicateDetectionConfig;
 
-  @Type(() => RecognitionConfig)
+  @Type(() => FacialRecognitionConfig)
   @ValidateNested()
   @IsObject()
-  facialRecognition!: RecognitionConfig;
+  facialRecognition!: FacialRecognitionConfig;
 }
 
 enum MapTheme {
diff --git a/server/src/interfaces/machine-learning.interface.ts b/server/src/interfaces/machine-learning.interface.ts
index 0aeed7635a..143281c23a 100644
--- a/server/src/interfaces/machine-learning.interface.ts
+++ b/server/src/interfaces/machine-learning.interface.ts
@@ -1,15 +1,5 @@
-import { CLIPConfig, RecognitionConfig } from 'src/dtos/model-config.dto';
-
 export const IMachineLearningRepository = 'IMachineLearningRepository';
 
-export interface VisionModelInput {
-  imagePath: string;
-}
-
-export interface TextModelInput {
-  text: string;
-}
-
 export interface BoundingBox {
   x1: number;
   y1: number;
@@ -17,26 +7,51 @@ export interface BoundingBox {
   y2: number;
 }
 
-export interface DetectFaceResult {
-  imageWidth: number;
-  imageHeight: number;
-  boundingBox: BoundingBox;
-  score: number;
-  embedding: number[];
+export enum ModelTask {
+  FACIAL_RECOGNITION = 'facial-recognition',
+  SEARCH = 'clip',
 }
 
 export enum ModelType {
-  FACIAL_RECOGNITION = 'facial-recognition',
-  CLIP = 'clip',
+  DETECTION = 'detection',
+  PIPELINE = 'pipeline',
+  RECOGNITION = 'recognition',
+  TEXTUAL = 'textual',
+  VISUAL = 'visual',
 }
 
-export enum CLIPMode {
-  VISION = 'vision',
-  TEXT = 'text',
+export type ModelPayload = { imagePath: string } | { text: string };
+
+type ModelOptions = { modelName: string };
+
+export type FaceDetectionOptions = ModelOptions & { minScore: number };
+
+type VisualResponse = { imageHeight: number; imageWidth: number };
+export type ClipVisualRequest = { [ModelTask.SEARCH]: { [ModelType.VISUAL]: ModelOptions } };
+export type ClipVisualResponse = { [ModelTask.SEARCH]: number[] } & VisualResponse;
+
+export type ClipTextualRequest = { [ModelTask.SEARCH]: { [ModelType.TEXTUAL]: ModelOptions } };
+export type ClipTextualResponse = { [ModelTask.SEARCH]: number[] };
+
+export type FacialRecognitionRequest = {
+  [ModelTask.FACIAL_RECOGNITION]: {
+    [ModelType.DETECTION]: FaceDetectionOptions;
+    [ModelType.RECOGNITION]: ModelOptions;
+  };
+};
+
+export interface Face {
+  boundingBox: BoundingBox;
+  embedding: number[];
+  score: number;
 }
 
+export type FacialRecognitionResponse = { [ModelTask.FACIAL_RECOGNITION]: Face[] } & VisualResponse;
+export type DetectedFaces = { faces: Face[] } & VisualResponse;
+export type MachineLearningRequest = ClipVisualRequest | ClipTextualRequest | FacialRecognitionRequest;
+
 export interface IMachineLearningRepository {
-  encodeImage(url: string, input: VisionModelInput, config: CLIPConfig): Promise<number[]>;
-  encodeText(url: string, input: TextModelInput, config: CLIPConfig): Promise<number[]>;
-  detectFaces(url: string, input: VisionModelInput, config: RecognitionConfig): Promise<DetectFaceResult[]>;
+  encodeImage(url: string, imagePath: string, config: ModelOptions): Promise<number[]>;
+  encodeText(url: string, text: string, config: ModelOptions): Promise<number[]>;
+  detectFaces(url: string, imagePath: string, config: FaceDetectionOptions): Promise<DetectedFaces>;
 }
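
The request types above encode the new protocol: options are nested under a task, then a model type, and responses come back keyed by the task. Here is a sketch of the three request shapes and the payloads they travel with, using only the types declared above; the model names and inputs are example values.

```ts
import {
  ClipTextualRequest,
  ClipVisualRequest,
  FacialRecognitionRequest,
  ModelPayload,
  ModelTask,
  ModelType,
} from 'src/interfaces/machine-learning.interface';

// CLIP image search: paired with an { imagePath } payload.
const clipVisual: ClipVisualRequest = {
  [ModelTask.SEARCH]: { [ModelType.VISUAL]: { modelName: 'ViT-B-32__openai' } },
};
const imagePayload: ModelPayload = { imagePath: '/data/thumbs/asset-1-preview.jpg' };

// CLIP text search: paired with a { text } payload.
const clipTextual: ClipTextualRequest = {
  [ModelTask.SEARCH]: { [ModelType.TEXTUAL]: { modelName: 'ViT-B-32__openai' } },
};
const textPayload: ModelPayload = { text: 'sunset at the beach' };

// Face detection and recognition in a single round trip, both under one task key.
const facialRecognition: FacialRecognitionRequest = {
  [ModelTask.FACIAL_RECOGNITION]: {
    [ModelType.DETECTION]: { modelName: 'buffalo_l', minScore: 0.7 },
    [ModelType.RECOGNITION]: { modelName: 'buffalo_l' },
  },
};
// Responses are keyed the same way, e.g. a FacialRecognitionResponse is
// { 'facial-recognition': Face[] } plus imageWidth and imageHeight.
```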
diff --git a/server/src/interfaces/search.interface.ts b/server/src/interfaces/search.interface.ts
index ce9e2a1940..d5382a04fa 100644
--- a/server/src/interfaces/search.interface.ts
+++ b/server/src/interfaces/search.interface.ts
@@ -37,8 +37,6 @@ export interface SearchExploreItem<T> {
   items: SearchExploreItemSet<T>;
 }
 
-export type Embedding = number[];
-
 export interface SearchAssetIDOptions {
   checksum?: Buffer;
   deviceAssetId?: string;
@@ -106,7 +104,7 @@ export interface SearchExifOptions {
 }
 
 export interface SearchEmbeddingOptions {
-  embedding: Embedding;
+  embedding: number[];
   userIds: string[];
 }
 
@@ -154,7 +152,7 @@ export interface FaceEmbeddingSearch extends SearchEmbeddingOptions {
 
 export interface AssetDuplicateSearch {
   assetId: string;
-  embedding: Embedding;
+  embedding: number[];
   maxDistance?: number;
   type: AssetType;
   userIds: string[];
diff --git a/server/src/repositories/machine-learning.repository.ts b/server/src/repositories/machine-learning.repository.ts
index bff22b9507..405e5a421d 100644
--- a/server/src/repositories/machine-learning.repository.ts
+++ b/server/src/repositories/machine-learning.repository.ts
@@ -1,13 +1,16 @@
 import { Injectable } from '@nestjs/common';
 import { readFile } from 'node:fs/promises';
-import { CLIPConfig, ModelConfig, RecognitionConfig } from 'src/dtos/model-config.dto';
+import { CLIPConfig } from 'src/dtos/model-config.dto';
 import {
-  CLIPMode,
-  DetectFaceResult,
+  ClipTextualResponse,
+  ClipVisualResponse,
+  FaceDetectionOptions,
+  FacialRecognitionResponse,
   IMachineLearningRepository,
+  MachineLearningRequest,
+  ModelPayload,
+  ModelTask,
   ModelType,
-  TextModelInput,
-  VisionModelInput,
 } from 'src/interfaces/machine-learning.interface';
 import { Instrumentation } from 'src/utils/instrumentation';
 
@@ -16,8 +19,8 @@ const errorPrefix = 'Machine learning request';
 @Instrumentation()
 @Injectable()
 export class MachineLearningRepository implements IMachineLearningRepository {
-  private async predict<T>(url: string, input: TextModelInput | VisionModelInput, config: ModelConfig): Promise<T> {
-    const formData = await this.getFormData(input, config);
+  private async predict<T>(url: string, payload: ModelPayload, config: MachineLearningRequest): Promise<T> {
+    const formData = await this.getFormData(payload, config);
 
     const res = await fetch(new URL('/predict', url), { method: 'POST', body: formData }).catch(
       (error: Error | any) => {
@@ -26,50 +29,46 @@ export class MachineLearningRepository implements IMachineLearningRepository {
     );
 
     if (res.status >= 400) {
-      const modelType = config.modelType ? ` for ${config.modelType.replace('-', ' ')}` : '';
-      throw new Error(`${errorPrefix}${modelType} failed with status ${res.status}: ${res.statusText}`);
+      throw new Error(`${errorPrefix} '${JSON.stringify(config)}' failed with status ${res.status}: ${res.statusText}`);
     }
     return res.json();
   }
 
-  detectFaces(url: string, input: VisionModelInput, config: RecognitionConfig): Promise<DetectFaceResult[]> {
-    return this.predict<DetectFaceResult[]>(url, input, { ...config, modelType: ModelType.FACIAL_RECOGNITION });
+  async detectFaces(url: string, imagePath: string, { modelName, minScore }: FaceDetectionOptions) {
+    const request = {
+      [ModelTask.FACIAL_RECOGNITION]: {
+        [ModelType.DETECTION]: { modelName, minScore },
+        [ModelType.RECOGNITION]: { modelName },
+      },
+    };
+    const response = await this.predict<FacialRecognitionResponse>(url, { imagePath }, request);
+    return {
+      imageHeight: response.imageHeight,
+      imageWidth: response.imageWidth,
+      faces: response[ModelTask.FACIAL_RECOGNITION],
+    };
   }
 
-  encodeImage(url: string, input: VisionModelInput, config: CLIPConfig): Promise<number[]> {
-    return this.predict<number[]>(url, input, {
-      ...config,
-      modelType: ModelType.CLIP,
-      mode: CLIPMode.VISION,
-    } as CLIPConfig);
+  async encodeImage(url: string, imagePath: string, { modelName }: CLIPConfig) {
+    const request = { [ModelTask.SEARCH]: { [ModelType.VISUAL]: { modelName } } };
+    const response = await this.predict<ClipVisualResponse>(url, { imagePath }, request);
+    return response[ModelTask.SEARCH];
   }
 
-  encodeText(url: string, input: TextModelInput, config: CLIPConfig): Promise<number[]> {
-    return this.predict<number[]>(url, input, {
-      ...config,
-      modelType: ModelType.CLIP,
-      mode: CLIPMode.TEXT,
-    } as CLIPConfig);
+  async encodeText(url: string, text: string, { modelName }: CLIPConfig) {
+    const request = { [ModelTask.SEARCH]: { [ModelType.TEXTUAL]: { modelName } } };
+    const response = await this.predict<ClipTextualResponse>(url, { text }, request);
+    return response[ModelTask.SEARCH];
   }
 
-  private async getFormData(input: TextModelInput | VisionModelInput, config: ModelConfig): Promise<FormData> {
+  private async getFormData(payload: ModelPayload, config: MachineLearningRequest): Promise<FormData> {
     const formData = new FormData();
-    const { enabled, modelName, modelType, ...options } = config;
-    if (!enabled) {
-      throw new Error(`${modelType} is not enabled`);
-    }
+    formData.append('entries', JSON.stringify(config));
 
-    formData.append('modelName', modelName);
-    if (modelType) {
-      formData.append('modelType', modelType);
-    }
-    if (options) {
-      formData.append('options', JSON.stringify(options));
-    }
-    if ('imagePath' in input) {
-      formData.append('image', new Blob([await readFile(input.imagePath)]));
-    } else if ('text' in input) {
-      formData.append('text', input.text);
+    if ('imagePath' in payload) {
+      formData.append('image', new Blob([await readFile(payload.imagePath)]));
+    } else if ('text' in payload) {
+      formData.append('text', payload.text);
     } else {
       throw new Error('Invalid input');
     }
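
On the wire, the repository now serializes the whole nested request into a single `entries` form field and attaches either an `image` blob or a `text` field next to it. A minimal standalone reproduction of the same POST is sketched below; the URL, model name, and file path are placeholders.

```ts
// Standalone sketch of the multipart body built by getFormData() above; values are placeholders.
import { readFile } from 'node:fs/promises';

const entries = { clip: { visual: { modelName: 'ViT-B-32__openai' } } }; // ModelTask.SEARCH / ModelType.VISUAL

const formData = new FormData();
formData.append('entries', JSON.stringify(entries));
formData.append('image', new Blob([await readFile('/data/thumbs/asset-1-preview.jpg')]));
// For text requests, append a 'text' field instead of the image blob.

const res = await fetch(new URL('/predict', 'http://immich-machine-learning:3003'), {
  method: 'POST',
  body: formData,
});
const { clip: embedding } = await res.json(); // ClipVisualResponse: number[] under the task key
```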
diff --git a/server/src/services/person.service.spec.ts b/server/src/services/person.service.spec.ts
index 1644c0c896..56447c8d20 100644
--- a/server/src/services/person.service.spec.ts
+++ b/server/src/services/person.service.spec.ts
@@ -7,7 +7,7 @@ import { IAssetRepository, WithoutProperty } from 'src/interfaces/asset.interfac
 import { ICryptoRepository } from 'src/interfaces/crypto.interface';
 import { IJobRepository, JobName, JobStatus } from 'src/interfaces/job.interface';
 import { ILoggerRepository } from 'src/interfaces/logger.interface';
-import { IMachineLearningRepository } from 'src/interfaces/machine-learning.interface';
+import { DetectedFaces, IMachineLearningRepository } from 'src/interfaces/machine-learning.interface';
 import { IMediaRepository } from 'src/interfaces/media.interface';
 import { IMoveRepository } from 'src/interfaces/move.interface';
 import { IPersonRepository } from 'src/interfaces/person.interface';
@@ -46,19 +46,21 @@ const responseDto: PersonResponseDto = {
 
 const statistics = { assets: 3 };
 
-const detectFaceMock = {
-  assetId: 'asset-1',
-  personId: 'person-1',
-  boundingBox: {
-    x1: 100,
-    y1: 100,
-    x2: 200,
-    y2: 200,
-  },
+const detectFaceMock: DetectedFaces = {
+  faces: [
+    {
+      boundingBox: {
+        x1: 100,
+        y1: 100,
+        x2: 200,
+        y2: 200,
+      },
+      embedding: [1, 2, 3, 4],
+      score: 0.2,
+    },
+  ],
   imageHeight: 500,
   imageWidth: 400,
-  embedding: [1, 2, 3, 4],
-  score: 0.2,
 };
 
 describe(PersonService.name, () => {
@@ -642,21 +644,13 @@ describe(PersonService.name, () => {
     it('should handle no results', async () => {
       const start = Date.now();
 
-      machineLearningMock.detectFaces.mockResolvedValue([]);
+      machineLearningMock.detectFaces.mockResolvedValue({ imageHeight: 500, imageWidth: 400, faces: [] });
       assetMock.getByIds.mockResolvedValue([assetStub.image]);
       await sut.handleDetectFaces({ id: assetStub.image.id });
       expect(machineLearningMock.detectFaces).toHaveBeenCalledWith(
         'http://immich-machine-learning:3003',
-        {
-          imagePath: assetStub.image.previewPath,
-        },
-        {
-          enabled: true,
-          maxDistance: 0.5,
-          minScore: 0.7,
-          minFaces: 3,
-          modelName: 'buffalo_l',
-        },
+        assetStub.image.previewPath,
+        expect.objectContaining({ minScore: 0.7, modelName: 'buffalo_l' }),
       );
       expect(personMock.createFaces).not.toHaveBeenCalled();
       expect(jobMock.queue).not.toHaveBeenCalled();
@@ -671,7 +665,7 @@ describe(PersonService.name, () => {
 
     it('should create a face with no person and queue recognition job', async () => {
       personMock.createFaces.mockResolvedValue([faceStub.face1.id]);
-      machineLearningMock.detectFaces.mockResolvedValue([detectFaceMock]);
+      machineLearningMock.detectFaces.mockResolvedValue(detectFaceMock);
       searchMock.searchFaces.mockResolvedValue([{ face: faceStub.face1, distance: 0.7 }]);
       assetMock.getByIds.mockResolvedValue([assetStub.image]);
       const face = {
diff --git a/server/src/services/person.service.ts b/server/src/services/person.service.ts
index de0c191667..faa65974d4 100644
--- a/server/src/services/person.service.ts
+++ b/server/src/services/person.service.ts
@@ -333,26 +333,28 @@ export class PersonService {
       return JobStatus.SKIPPED;
     }
 
-    const faces = await this.machineLearningRepository.detectFaces(
+    if (!asset.isVisible) {
+      return JobStatus.SKIPPED;
+    }
+
+    const { imageHeight, imageWidth, faces } = await this.machineLearningRepository.detectFaces(
       machineLearning.url,
-      { imagePath: asset.previewPath },
+      asset.previewPath,
       machineLearning.facialRecognition,
     );
 
     this.logger.debug(`${faces.length} faces detected in ${asset.previewPath}`);
-    this.logger.verbose(faces.map((face) => ({ ...face, embedding: `vector(${face.embedding.length})` })));
 
     if (faces.length > 0) {
       await this.jobRepository.queue({ name: JobName.QUEUE_FACIAL_RECOGNITION, data: { force: false } });
-
       const mappedFaces = faces.map((face) => ({
         assetId: asset.id,
         embedding: face.embedding,
-        imageHeight: face.imageHeight,
-        imageWidth: face.imageWidth,
+        imageHeight,
+        imageWidth,
         boundingBoxX1: face.boundingBox.x1,
-        boundingBoxX2: face.boundingBox.x2,
         boundingBoxY1: face.boundingBox.y1,
+        boundingBoxX2: face.boundingBox.x2,
         boundingBoxY2: face.boundingBox.y2,
       }));
 
diff --git a/server/src/services/search.service.ts b/server/src/services/search.service.ts
index 8c89218138..9213cc4290 100644
--- a/server/src/services/search.service.ts
+++ b/server/src/services/search.service.ts
@@ -102,12 +102,7 @@ export class SearchService {
 
     const userIds = await this.getUserIdsToSearch(auth);
 
-    const embedding = await this.machineLearning.encodeText(
-      machineLearning.url,
-      { text: dto.query },
-      machineLearning.clip,
-    );
-
+    const embedding = await this.machineLearning.encodeText(machineLearning.url, dto.query, machineLearning.clip);
     const page = dto.page ?? 1;
     const size = dto.size || 100;
     const { hasNextPage, items } = await this.searchRepository.searchSmart(
diff --git a/server/src/services/smart-info.service.spec.ts b/server/src/services/smart-info.service.spec.ts
index 7ac6dac414..95f76edc49 100644
--- a/server/src/services/smart-info.service.spec.ts
+++ b/server/src/services/smart-info.service.spec.ts
@@ -108,8 +108,8 @@ describe(SmartInfoService.name, () => {
 
       expect(machineMock.encodeImage).toHaveBeenCalledWith(
         'http://immich-machine-learning:3003',
-        { imagePath: assetStub.image.previewPath },
-        { enabled: true, modelName: 'ViT-B-32__openai' },
+        assetStub.image.previewPath,
+        expect.objectContaining({ modelName: 'ViT-B-32__openai' }),
       );
       expect(searchMock.upsert).toHaveBeenCalledWith(assetStub.image.id, [0.01, 0.02, 0.03]);
     });
diff --git a/server/src/services/smart-info.service.ts b/server/src/services/smart-info.service.ts
index f902aa7e57..46a57c3cd0 100644
--- a/server/src/services/smart-info.service.ts
+++ b/server/src/services/smart-info.service.ts
@@ -93,9 +93,9 @@ export class SmartInfoService {
       return JobStatus.FAILED;
     }
 
-    const clipEmbedding = await this.machineLearning.encodeImage(
+    const embedding = await this.machineLearning.encodeImage(
       machineLearning.url,
-      { imagePath: asset.previewPath },
+      asset.previewPath,
       machineLearning.clip,
     );
 
@@ -104,7 +104,7 @@ export class SmartInfoService {
       await this.databaseRepository.wait(DatabaseLock.CLIPDimSize);
     }
 
-    await this.repository.upsert(asset.id, clipEmbedding);
+    await this.repository.upsert(asset.id, embedding);
 
     return JobStatus.SUCCESS;
   }
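
With the wrapper objects gone, both CLIP entry points now take the raw input directly and resolve to the embedding array. A usage sketch against the updated interface; `repo` stands in for the injected repository and the query text and path are placeholders.

```ts
import { IMachineLearningRepository } from 'src/interfaces/machine-learning.interface';

// Sketch: `repo` would be injected by Nest in real code; inputs are placeholder values.
async function embedBoth(repo: IMachineLearningRepository) {
  const url = 'http://immich-machine-learning:3003';

  const textEmbedding = await repo.encodeText(url, 'sunset at the beach', { modelName: 'ViT-B-32__openai' });
  const imageEmbedding = await repo.encodeImage(url, '/data/thumbs/asset-1-preview.jpg', {
    modelName: 'ViT-B-32__openai',
  });

  return { textEmbedding, imageEmbedding };
}
```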