increase test coverage and add typing

airbytehq · Nov 27, 2024 · 5f8b8a3 · 5f8b8a3
1 parent 42be95f
commit 5f8b8a3
Show file tree

Hide file tree

Showing 4 changed files with 23 additions and 11 deletions.
diff --git a/airbyte_cdk/destinations/vector_db_based/config.py b/airbyte_cdk/destinations/vector_db_based/config.py
@@ -111,7 +111,7 @@ class ProcessingConfigModel(BaseModel):
     omit_field_names_from_embeddings: bool = Field(
         default=False,
         title="Omit field names from embeddings",
-        description="Do not include the field names in the text that gets embedded. By default field names are embedded e.g 'user: name, user.email: email@email.com'. If set to true, only the values are embedded e.g. 'name, email@email.com'.",
+        description="Do not include the field names in the text that gets embedded. By default field names are embedded (e.g., 'user.name: John Doe \n user.email: john@example.com'). If set to true, only the values are embedded (e.g., 'John Doe \n john@example.com').",
         always_show=True,
     )
     metadata_fields: Optional[List[str]] = Field(

diff --git a/airbyte_cdk/destinations/vector_db_based/document_processor.py b/airbyte_cdk/destinations/vector_db_based/document_processor.py
@@ -163,18 +163,23 @@ def _generate_document(self, record: AirbyteRecordMessage) -> Optional[Document]
         relevant_fields = self._extract_relevant_fields(record, self.text_fields)
         if len(relevant_fields) == 0:
             return None
-        if not self.omit_field_names_from_embeddings:
-            text = stringify_dict(relevant_fields)
-        else:
-            text = self._extract_values_from_dict(relevant_fields)
+        text = self._generate_text_from_fields(relevant_fields)
         metadata = self._extract_metadata(record)
         return Document(page_content=text, metadata=metadata)
+
+    def _generate_text_from_fields(self, fields: Dict[str, Any]) -> str:
+        if self.omit_field_names_from_embeddings:
+            return self._extract_values_from_dict(fields)
+        else:
+            return stringify_dict(fields)
 
-    def _extract_values_from_dict(self, data: Union[dict, list, Any]) -> str:
-        if isinstance(data, dict):
-            return "\n".join(self._extract_values_from_dict(value) for value in data.values())
+    def _extract_values_from_dict(self, data: Union[Dict[Any, Any], List[Any], Any], join_char: str = '\n') -> str:
+        if data is None:
+            return ""
+        elif isinstance(data, dict):
+            return join_char.join(self._extract_values_from_dict(value) for value in data.values())
         elif isinstance(data, list):
-            return "\n".join(self._extract_values_from_dict(item) for item in data)
+            return join_char.join(self._extract_values_from_dict(item) for item in data)
         else:
             return str(data)
 

diff --git a/unit_tests/destinations/vector_db_based/config_test.py b/unit_tests/destinations/vector_db_based/config_test.py
@@ -244,7 +244,7 @@ def test_json_schema_generation():
                     },
                     "omit_field_names_from_embeddings": {
                         "title": "Omit field names from embeddings",
-                        "description": "Do not include the field names in the text that gets embedded. By default field names are embedded e.g 'user: name, user.email: email@email.com'. If set to true, only the values are embedded e.g. 'name, email@email.com'.",
+                        "description": "Do not include the field names in the text that gets embedded. By default field names are embedded (e.g., 'user.name: John Doe \n user.email: john@example.com'). If set to true, only the values are embedded (e.g., 'John Doe \n john@example.com').",
                         "default": False,
                         "always_show": True,
                         "type": "boolean",

diff --git a/unit_tests/destinations/vector_db_based/document_processor_test.py b/unit_tests/destinations/vector_db_based/document_processor_test.py
@@ -232,6 +232,9 @@ def test_complex_text_fields_omit_field_names():
             "non_text_2": 1,
             "text": "This is the regular text",
             "other_nested": {"non_text": {"a": "xyz", "b": "abc"}},
+            "empty_list": [],
+            "empty_dict": {},
+            "large_nested": {"a": {"b": {"c": {"d": {"e": {"f": {"g": "h"}}}}}}},
         },
         emitted_at=1234,
     )
@@ -241,6 +244,9 @@ def test_complex_text_fields_omit_field_names():
         "text",
         "other_nested.non_text",
         "non.*.existing",
+        "large_nested",
+        "empty_list",
+        "empty_dict",
     ]
     processor.metadata_fields = ["non_text", "non_text_2", "id"]
     processor.omit_field_names_from_embeddings = True
@@ -254,7 +260,8 @@ def test_complex_text_fields_omit_field_names():
 And another
 This is the regular text
 xyz
-abc"""
+abc
+h"""
     )
     assert chunks[0].metadata == {
         "id": 1,