Skip to content

Commit

Permalink
duplicates flow
Browse files Browse the repository at this point in the history
  • Loading branch information
githubering182 committed Aug 19, 2024
1 parent a33c7ed commit 364f8cd
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 6 deletions.
10 changes: 8 additions & 2 deletions backend-app/file/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,18 @@


class File(Model):
STATUSES: tuple = (('d', "declined"), ('a', "accepted"), ('v', "validation"))
STATUSES: tuple = (
("p", "processing"),
('v', "validation"),
('a', "accepted"),
('d', "declined"),
("u", "duplicated"),
)

id: CharField = CharField(max_length=24, primary_key=True, unique=True)
file_name: CharField = CharField(max_length=255)
file_type: CharField = CharField(max_length=10)
status: CharField = CharField(max_length=1, choices=STATUSES, default='v')
status: CharField = CharField(max_length=1, choices=STATUSES, default='p')
is_downloaded: BooleanField = BooleanField(default=False)
upload_date: DateTimeField = DateTimeField(auto_now_add=True)
update_date: DateTimeField = DateTimeField(auto_now_add=True, null=True)
Expand Down
10 changes: 6 additions & 4 deletions storage-app/src/shared/embedding_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ class Query(Enum):
returning id;
"""
SELECT = """
select rowid, distance
from file_embedding
select fe.rowid, rf.file_id, fe.distance
from file_embedding as fe
left join rowid_file as rf
on fe.rowid = rf.id
where embedding match ?
and distance <= ?
and k = ?
Expand Down Expand Up @@ -111,11 +113,11 @@ def insert(self, cur: Cursor, file_id: str, embedding: ndarray) -> str:
return row_id

@with_transaction
def select(
def search(
self,
cur: Cursor,
embedding: ndarray
) -> list[tuple[int, float]]:
) -> list[tuple[int, str, float]]:
return cur.execute(
Query.SELECT.value,
[embedding, SIMILAR_THRESHOLD, self._k_nearest]
Expand Down
23 changes: 23 additions & 0 deletions storage-app/src/shared/worker_services.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from enum import Enum
from json import dumps
from zipfile import ZipFile, ZIP_DEFLATED
from io import BytesIO
Expand All @@ -13,6 +14,7 @@
from shared.storage_db import DataBase
from shared.app_services import Bucket
from shared.utils import emit_token
from shared.embedding_db import EmbeddingStorage
from bson import ObjectId
from typing import Any, Optional
import requests
Expand Down Expand Up @@ -126,3 +128,24 @@ async def hash(self):
case "image": self.embedding = IHash(self.file.file).embedding
case "video": self.embedding = VHash(self.file.file).embedding
case _: raise ValueError("Unsupported file type")

def search_similar(self):
    """Mark the file as a duplicate or register its embedding as new.

    Searches the embedding storage with ``self.embedding``. If the search
    returns any rows, the status is set to ``"u"``. Otherwise the embedding
    is inserted under ``self.file_id`` and the status is set to ``"v"``.
    The chosen status is passed to ``self.update_status``.
    """
    with EmbeddingStorage() as storage:
        # Branch explicitly instead of ``assert`` / ``except AssertionError``:
        # asserts are stripped under ``python -O`` (every file would then be
        # inserted and marked "v"), and the handler would also misreport any
        # AssertionError raised inside the storage layer as a duplicate.
        if storage.search(self.embedding):
            new_status = "u"
        else:
            storage.insert(self.file_id, self.embedding)
            new_status = "v"

    self.update_status(new_status)

def update_status(self, status: str):
    """Stub: accepts a status code, does nothing yet, and returns ``None``."""
    return None

0 comments on commit 364f8cd

Please sign in to comment.