Skip to content

Commit

Permalink
increase test coverage and add typing
Browse files Browse the repository at this point in the history
  • Loading branch information
Frankie Bromage committed Nov 27, 2024
1 parent 42be95f commit 5f8b8a3
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 11 deletions.
2 changes: 1 addition & 1 deletion airbyte_cdk/destinations/vector_db_based/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ class ProcessingConfigModel(BaseModel):
omit_field_names_from_embeddings: bool = Field(
default=False,
title="Omit field names from embeddings",
description="Do not include the field names in the text that gets embedded. By default field names are embedded e.g 'user: name, user.email: email@email.com'. If set to true, only the values are embedded e.g. 'name, email@email.com'.",
description="Do not include the field names in the text that gets embedded. By default field names are embedded (e.g., 'user.name: John Doe \n user.email: john@example.com'). If set to true, only the values are embedded (e.g., 'John Doe \n john@example.com').",
always_show=True,
)
metadata_fields: Optional[List[str]] = Field(
Expand Down
21 changes: 13 additions & 8 deletions airbyte_cdk/destinations/vector_db_based/document_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,18 +163,23 @@ def _generate_document(self, record: AirbyteRecordMessage) -> Optional[Document]
relevant_fields = self._extract_relevant_fields(record, self.text_fields)
if len(relevant_fields) == 0:
return None
if not self.omit_field_names_from_embeddings:
text = stringify_dict(relevant_fields)
else:
text = self._extract_values_from_dict(relevant_fields)
text = self._generate_text_from_fields(relevant_fields)
metadata = self._extract_metadata(record)
return Document(page_content=text, metadata=metadata)

def _generate_text_from_fields(self, fields: Dict[str, Any]) -> str:
    """Build the text that will be embedded from the selected record fields.

    When ``omit_field_names_from_embeddings`` is falsy the fields are handed to
    ``stringify_dict``; otherwise only the values are emitted through
    ``_extract_values_from_dict``.

    Args:
        fields: Mapping of the fields picked for embedding.

    Returns:
        The text representation of ``fields``.
    """
    if not self.omit_field_names_from_embeddings:
        return stringify_dict(fields)
    return self._extract_values_from_dict(fields)

def _extract_values_from_dict(self, data: Union[Dict[Any, Any], List[Any], Any], join_char: str = "\n") -> str:
    """Flatten a nested structure into a string holding only its leaf values.

    Dicts contribute their values (keys are dropped) and lists contribute their
    items, recursively. The pieces at every nesting level are joined with
    ``join_char``. ``None`` becomes an empty string and any other value is
    converted with ``str``.

    Args:
        data: A dict, a list, a scalar, or ``None``.
        join_char: Separator placed between sibling values. Defaults to a newline.

    Returns:
        The joined leaf values. ``None`` and empty containers yield ``""``.
    """
    if data is None:
        return ""
    # join_char is forwarded on every recursive call; without that, nested
    # levels would silently fall back to the default separator.
    if isinstance(data, dict):
        return join_char.join(self._extract_values_from_dict(value, join_char) for value in data.values())
    if isinstance(data, list):
        return join_char.join(self._extract_values_from_dict(item, join_char) for item in data)
    return str(data)

Expand Down
2 changes: 1 addition & 1 deletion unit_tests/destinations/vector_db_based/config_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def test_json_schema_generation():
},
"omit_field_names_from_embeddings": {
"title": "Omit field names from embeddings",
"description": "Do not include the field names in the text that gets embedded. By default field names are embedded e.g 'user: name, user.email: email@email.com'. If set to true, only the values are embedded e.g. 'name, email@email.com'.",
"description": "Do not include the field names in the text that gets embedded. By default field names are embedded (e.g., 'user.name: John Doe \n user.email: john@example.com'). If set to true, only the values are embedded (e.g., 'John Doe \n john@example.com').",
"default": False,
"always_show": True,
"type": "boolean",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,9 @@ def test_complex_text_fields_omit_field_names():
"non_text_2": 1,
"text": "This is the regular text",
"other_nested": {"non_text": {"a": "xyz", "b": "abc"}},
"empty_list": [],
"empty_dict": {},
"large_nested": {"a": {"b": {"c": {"d": {"e": {"f": {"g": "h"}}}}}}},
},
emitted_at=1234,
)
Expand All @@ -241,6 +244,9 @@ def test_complex_text_fields_omit_field_names():
"text",
"other_nested.non_text",
"non.*.existing",
"large_nested",
"empty_list",
"empty_dict",
]
processor.metadata_fields = ["non_text", "non_text_2", "id"]
processor.omit_field_names_from_embeddings = True
Expand All @@ -254,7 +260,8 @@ def test_complex_text_fields_omit_field_names():
And another
This is the regular text
xyz
abc"""
abc
h"""
)
assert chunks[0].metadata == {
"id": 1,
Expand Down

0 comments on commit 5f8b8a3

Please sign in to comment.