Mirror of https://github.com/immich-app/immich.git
Synced 2025-06-16 21:38:28 +02:00
refactor: migrate person repository to kysely (#15242)
* refactor: migrate person repository to kysely
* `asVector` begone
* linting
* fix metadata faces
* update test

Co-authored-by: Alex <alex.tran1502@gmail.com>
Co-authored-by: mertalev <101130780+mertalev@users.noreply.github.com>
This commit is contained in:
parent 0c152366ec
commit 332a865ce6

29 changed files with 715 additions and 747 deletions
machine-learning/app

@@ -10,7 +10,7 @@ from tokenizers import Encoding, Tokenizer
 from app.config import log
 from app.models.base import InferenceModel
-from app.models.transforms import clean_text
+from app.models.transforms import clean_text, serialize_np_array
 from app.schemas import ModelSession, ModelTask, ModelType

@@ -18,9 +18,9 @@ class BaseCLIPTextualEncoder(InferenceModel):
     depends = []
     identity = (ModelType.TEXTUAL, ModelTask.SEARCH)

-    def _predict(self, inputs: str, **kwargs: Any) -> NDArray[np.float32]:
+    def _predict(self, inputs: str, **kwargs: Any) -> str:
         res: NDArray[np.float32] = self.session.run(None, self.tokenize(inputs))[0][0]
-        return res
+        return serialize_np_array(res)

     def _load(self) -> ModelSession:
         session = super()._load()
@@ -10,7 +10,15 @@ from PIL import Image
 from app.config import log
 from app.models.base import InferenceModel
-from app.models.transforms import crop_pil, decode_pil, get_pil_resampling, normalize, resize_pil, to_numpy
+from app.models.transforms import (
+    crop_pil,
+    decode_pil,
+    get_pil_resampling,
+    normalize,
+    resize_pil,
+    serialize_np_array,
+    to_numpy,
+)
 from app.schemas import ModelSession, ModelTask, ModelType

@@ -18,10 +26,10 @@ class BaseCLIPVisualEncoder(InferenceModel):
     depends = []
     identity = (ModelType.VISUAL, ModelTask.SEARCH)

-    def _predict(self, inputs: Image.Image | bytes, **kwargs: Any) -> NDArray[np.float32]:
+    def _predict(self, inputs: Image.Image | bytes, **kwargs: Any) -> str:
         image = decode_pil(inputs)
         res: NDArray[np.float32] = self.session.run(None, self.transform(image))[0][0]
-        return res
+        return serialize_np_array(res)

     @abstractmethod
     def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]:
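
From the caller's point of view, both CLIP encoders now return the embedding already serialized: predict() hands back a JSON array encoded as a string instead of an NDArray. A minimal sketch of the new contract (the literal value below is an illustrative stand-in, not real model output):

    import orjson

    # illustrative stand-in for what the visual/textual encoders now return
    embedding_str = "[0.25, -0.5, 0.125]"

    # the consumer can forward the string as-is; parse only when the numbers are needed
    vector = orjson.loads(embedding_str)
    assert isinstance(vector, list) and all(isinstance(x, float) for x in vector)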
@@ -12,7 +12,7 @@ from PIL import Image
 from app.config import log, settings
 from app.models.base import InferenceModel
-from app.models.transforms import decode_cv2
+from app.models.transforms import decode_cv2, serialize_np_array
 from app.schemas import FaceDetectionOutput, FacialRecognitionOutput, ModelFormat, ModelSession, ModelTask, ModelType

@@ -61,7 +61,7 @@ class FaceRecognizer(InferenceModel):
         return [
             {
                 "boundingBox": {"x1": x1, "y1": y1, "x2": x2, "y2": y2},
-                "embedding": embedding,
+                "embedding": serialize_np_array(embedding),
                 "score": score,
             }
             for (x1, y1, x2, y2), embedding, score in zip(faces["boxes"], embeddings, faces["scores"])
@@ -4,6 +4,7 @@ from typing import IO
 import cv2
 import numpy as np
+import orjson
 from numpy.typing import NDArray
 from PIL import Image

@@ -69,3 +70,9 @@ def clean_text(text: str, canonicalize: bool = False) -> str:
     if canonicalize:
         text = text.translate(_PUNCTUATION_TRANS).lower()
     return text
+
+
+# this allows the client to use the array as a string without deserializing only to serialize back to a string
+# TODO: use this in a less invasive way
+def serialize_np_array(arr: NDArray[np.float32]) -> str:
+    return orjson.dumps(arr, option=orjson.OPT_SERIALIZE_NUMPY).decode()
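
The comment in the new helper carries the rationale: downstream consumers treat the embedding as an opaque string, so serializing once in the model avoids a deserialize/re-serialize round trip later. A quick sketch of what serialize_np_array produces and how to invert it (array values chosen to be exact in binary, purely for illustration):

    import numpy as np
    import orjson

    arr = np.array([0.25, -0.5, 0.125], dtype=np.float32)
    serialized = orjson.dumps(arr, option=orjson.OPT_SERIALIZE_NUMPY).decode()
    # OPT_SERIALIZE_NUMPY renders the ndarray as a plain JSON list: "[0.25,-0.5,0.125]"

    # round trip, for callers that do need the numbers back
    restored = np.asarray(orjson.loads(serialized), dtype=np.float32)
    assert np.allclose(arr, restored)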
@@ -79,7 +79,7 @@ class FaceDetectionOutput(TypedDict):

 class DetectedFace(TypedDict):
     boundingBox: BoundingBox
-    embedding: npt.NDArray[np.float32]
+    embedding: str
     score: float
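
With the schema change, a DetectedFace carries its embedding as a JSON string rather than an ndarray; only the type annotation moves, the surrounding keys are unchanged. A hand-written example of the new shape (all values illustrative):

    face = {
        "boundingBox": {"x1": 12, "y1": 8, "x2": 140, "y2": 152},
        "embedding": "[0.25,-0.5,0.125]",  # JSON-encoded vector, no longer an NDArray
        "score": 0.98,
    }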
@@ -10,6 +10,7 @@ from unittest import mock
 import cv2
 import numpy as np
 import onnxruntime as ort
+import orjson
 import pytest
 from fastapi import HTTPException
 from fastapi.testclient import TestClient
@@ -346,11 +347,11 @@ class TestCLIP:
         mocked.run.return_value = [[self.embedding]]

         clip_encoder = OpenClipVisualEncoder("ViT-B-32__openai", cache_dir="test_cache")
-        embedding = clip_encoder.predict(pil_image)
-
-        assert isinstance(embedding, np.ndarray)
-        assert embedding.shape[0] == clip_model_cfg["embed_dim"]
-        assert embedding.dtype == np.float32
+        embedding_str = clip_encoder.predict(pil_image)
+        assert isinstance(embedding_str, str)
+        embedding = orjson.loads(embedding_str)
+        assert isinstance(embedding, list)
+        assert len(embedding) == clip_model_cfg["embed_dim"]
         mocked.run.assert_called_once()

     def test_basic_text(
@@ -368,11 +369,11 @@ class TestCLIP:
         mocker.patch("app.models.clip.textual.Tokenizer.from_file", autospec=True)

         clip_encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir="test_cache")
-        embedding = clip_encoder.predict("test search query")
-
-        assert isinstance(embedding, np.ndarray)
-        assert embedding.shape[0] == clip_model_cfg["embed_dim"]
-        assert embedding.dtype == np.float32
+        embedding_str = clip_encoder.predict("test search query")
+        assert isinstance(embedding_str, str)
+        embedding = orjson.loads(embedding_str)
+        assert isinstance(embedding, list)
+        assert len(embedding) == clip_model_cfg["embed_dim"]
         mocked.run.assert_called_once()

     def test_openclip_tokenizer(
@@ -508,8 +509,11 @@ class TestFaceRecognition:
             assert isinstance(face.get("boundingBox"), dict)
             assert set(face["boundingBox"]) == {"x1", "y1", "x2", "y2"}
             assert all(isinstance(val, np.float32) for val in face["boundingBox"].values())
-            assert isinstance(face.get("embedding"), np.ndarray)
-            assert face["embedding"].shape[0] == 512
+            embedding_str = face.get("embedding")
+            assert isinstance(embedding_str, str)
+            embedding = orjson.loads(embedding_str)
+            assert isinstance(embedding, list)
+            assert len(embedding) == 512
             assert isinstance(face.get("score", None), np.float32)

         rec_model.get_feat.assert_called_once()
@@ -880,8 +884,10 @@ class TestPredictionEndpoints:
         actual = response.json()
         assert response.status_code == 200
         assert isinstance(actual, dict)
-        assert isinstance(actual.get("clip", None), list)
-        assert np.allclose(expected, actual["clip"])
+        embedding = actual.get("clip", None)
+        assert isinstance(embedding, str)
+        parsed_embedding = orjson.loads(embedding)
+        assert np.allclose(expected, parsed_embedding)

     def test_clip_text_endpoint(self, responses: dict[str, Any], deployed_app: TestClient) -> None:
         expected = responses["clip"]["text"]
@@ -901,8 +907,10 @@ class TestPredictionEndpoints:
         actual = response.json()
         assert response.status_code == 200
         assert isinstance(actual, dict)
-        assert isinstance(actual.get("clip", None), list)
-        assert np.allclose(expected, actual["clip"])
+        embedding = actual.get("clip", None)
+        assert isinstance(embedding, str)
+        parsed_embedding = orjson.loads(embedding)
+        assert np.allclose(expected, parsed_embedding)

     def test_face_endpoint(self, pil_image: Image.Image, responses: dict[str, Any], deployed_app: TestClient) -> None:
         byte_image = BytesIO()
@@ -933,5 +941,8 @@ class TestPredictionEndpoints:

         for expected_face, actual_face in zip(responses["facial-recognition"], actual["facial-recognition"]):
             assert expected_face["boundingBox"] == actual_face["boundingBox"]
-            assert np.allclose(expected_face["embedding"], actual_face["embedding"])
+            embedding = actual_face.get("embedding", None)
+            assert isinstance(embedding, str)
+            parsed_embedding = orjson.loads(embedding)
+            assert np.allclose(expected_face["embedding"], parsed_embedding)
             assert np.allclose(expected_face["score"], actual_face["score"])