diff --git a/.github/workflows/testing-unit.yaml b/.github/workflows/testing-unit.yaml index 3574b1c2..19842b24 100644 --- a/.github/workflows/testing-unit.yaml +++ b/.github/workflows/testing-unit.yaml @@ -7,6 +7,7 @@ jobs: name: Unit tests runs-on: ubuntu-latest strategy: + fail-fast: false matrix: python-version: - 3.8 @@ -28,8 +29,11 @@ jobs: with: include_grpc: '${{ matrix.use_grpc }}' include_types: true - - name: Run unit tests + - name: Run unit tests (REST) run: poetry run pytest --cov=pinecone --timeout=120 tests/unit + - name: Run unit tests (GRPC) + if: ${{ matrix.use_grpc == true }} + run: poetry run pytest --cov=pinecone/grpc --timeout=120 tests/unit_grpc - name: mypy check env: INCLUDE_GRPC: '${{ matrix.use_grpc }}' diff --git a/pinecone/data/__init__.py b/pinecone/data/__init__.py index 5144527c..4caf309e 100644 --- a/pinecone/data/__init__.py +++ b/pinecone/data/__init__.py @@ -1,5 +1,5 @@ from .index import * -from .vector_factory import ( +from .errors import ( VectorDictionaryMissingKeysError, VectorDictionaryExcessKeysError, VectorTupleLengthError, diff --git a/pinecone/data/errors.py b/pinecone/data/errors.py new file mode 100644 index 00000000..7749a9cf --- /dev/null +++ b/pinecone/data/errors.py @@ -0,0 +1,37 @@ +from ..utils.constants import REQUIRED_VECTOR_FIELDS, OPTIONAL_VECTOR_FIELDS + +class VectorDictionaryMissingKeysError(ValueError): + def __init__(self, item): + message = f"Vector dictionary is missing required fields: {list(REQUIRED_VECTOR_FIELDS - set(item.keys()))}" + super().__init__(message) + +class VectorDictionaryExcessKeysError(ValueError): + def __init__(self, item): + invalid_keys = list(set(item.keys()) - (REQUIRED_VECTOR_FIELDS | OPTIONAL_VECTOR_FIELDS)) + message = f"Found excess keys in the vector dictionary: {invalid_keys}. The allowed keys are: {list(REQUIRED_VECTOR_FIELDS | OPTIONAL_VECTOR_FIELDS)}" + super().__init__(message) + +class VectorTupleLengthError(ValueError): + def __init__(self, item): + message = f"Found a tuple of length {len(item)} which is not supported. Vectors can be represented as tuples either the form (id, values, metadata) or (id, values). To pass sparse values please use either dicts or Vector objects as inputs." + super().__init__(message) + +class SparseValuesTypeError(ValueError, TypeError): + def __init__(self): + message = "Found unexpected data in column `sparse_values`. Expected format is `'sparse_values': {'indices': List[int], 'values': List[float]}`." + super().__init__(message) + +class SparseValuesMissingKeysError(ValueError): + def __init__(self, sparse_values_dict): + message = f"Missing required keys in data in column `sparse_values`. Expected format is `'sparse_values': {{'indices': List[int], 'values': List[float]}}`. Found keys {list(sparse_values_dict.keys())}" + super().__init__(message) + +class SparseValuesDictionaryExpectedError(ValueError, TypeError): + def __init__(self, sparse_values_dict): + message = f"Column `sparse_values` is expected to be a dictionary, found {type(sparse_values_dict)}" + super().__init__(message) + +class MetadataDictionaryExpectedError(ValueError, TypeError): + def __init__(self, item): + message = f"Column `metadata` is expected to be a dictionary, found {type(item['metadata'])}" + super().__init__(message) diff --git a/pinecone/data/sparse_vector_factory.py b/pinecone/data/sparse_vector_factory.py new file mode 100644 index 00000000..e759ffdd --- /dev/null +++ b/pinecone/data/sparse_vector_factory.py @@ -0,0 +1,54 @@ +import numbers + +from collections.abc import Mapping +from typing import Union, Dict + +from ..utils import convert_to_list + +from .errors import ( + SparseValuesTypeError, + SparseValuesMissingKeysError, + SparseValuesDictionaryExpectedError +) + +from pinecone.core.client.models import ( + SparseValues +) + +class SparseValuesFactory: + @staticmethod + def build(input: Union[Dict, SparseValues]) -> SparseValues: + if input is None: + return input + if isinstance(input, SparseValues): + return input + if not isinstance(input, Mapping): + raise SparseValuesDictionaryExpectedError(input) + if not {"indices", "values"}.issubset(input): + raise SparseValuesMissingKeysError(input) + + indices = SparseValuesFactory._convert_to_list(input.get("indices"), int) + values = SparseValuesFactory._convert_to_list(input.get("values"), float) + + if len(indices) != len(values): + raise ValueError("Sparse values indices and values must have the same length") + + try: + return SparseValues(indices=indices, values=values) + except TypeError as e: + raise SparseValuesTypeError() from e + + @staticmethod + def _convert_to_list(input, expected_type): + try: + converted = convert_to_list(input) + except TypeError as e: + raise SparseValuesTypeError() from e + + SparseValuesFactory._validate_list_items_type(converted, expected_type) + return converted + + @staticmethod + def _validate_list_items_type(input, expected_type): + if len(input) > 0 and not isinstance(input[0], expected_type): + raise SparseValuesTypeError() \ No newline at end of file diff --git a/pinecone/data/vector_factory.py b/pinecone/data/vector_factory.py index ff1f0bd6..3b486b1c 100644 --- a/pinecone/data/vector_factory.py +++ b/pinecone/data/vector_factory.py @@ -5,47 +5,19 @@ from ..utils import fix_tuple_length, convert_to_list from ..utils.constants import REQUIRED_VECTOR_FIELDS, OPTIONAL_VECTOR_FIELDS +from .sparse_vector_factory import SparseValuesFactory from pinecone.core.client.models import ( Vector, SparseValues ) -class VectorDictionaryMissingKeysError(ValueError): - def __init__(self, item): - message = f"Vector dictionary is missing required fields: {list(REQUIRED_VECTOR_FIELDS - set(item.keys()))}" - super().__init__(message) - -class VectorDictionaryExcessKeysError(ValueError): - def __init__(self, item): - invalid_keys = list(set(item.keys()) - (REQUIRED_VECTOR_FIELDS | OPTIONAL_VECTOR_FIELDS)) - message = f"Found excess keys in the vector dictionary: {invalid_keys}. The allowed keys are: {list(REQUIRED_VECTOR_FIELDS | OPTIONAL_VECTOR_FIELDS)}" - super().__init__(message) - -class VectorTupleLengthError(ValueError): - def __init__(self, item): - message = f"Found a tuple of length {len(item)} which is not supported. Vectors can be represented as tuples either the form (id, values, metadata) or (id, values). To pass sparse values please use either dicts or Vector objects as inputs." - super().__init__(message) - -class SparseValuesTypeError(ValueError, TypeError): - def __init__(self): - message = "Found unexpected data in column `sparse_values`. Expected format is `'sparse_values': {'indices': List[int], 'values': List[float]}`." - super().__init__(message) - -class SparseValuesMissingKeysError(ValueError): - def __init__(self, sparse_values_dict): - message = f"Missing required keys in data in column `sparse_values`. Expected format is `'sparse_values': {{'indices': List[int], 'values': List[float]}}`. Found keys {list(sparse_values_dict.keys())}" - super().__init__(message) - -class SparseValuesDictionaryExpectedError(ValueError, TypeError): - def __init__(self, sparse_values_dict): - message = f"Column `sparse_values` is expected to be a dictionary, found {type(sparse_values_dict)}" - super().__init__(message) - -class MetadataDictionaryExpectedError(ValueError, TypeError): - def __init__(self, item): - message = f"Column `metadata` is expected to be a dictionary, found {type(item['metadata'])}" - super().__init__(message) +from .errors import ( + VectorDictionaryMissingKeysError, + VectorDictionaryExcessKeysError, + VectorTupleLengthError, + MetadataDictionaryExpectedError, +) class VectorFactory: @staticmethod @@ -84,8 +56,10 @@ def _dict_to_vector(item, check_type: bool) -> Vector: item["values"] = convert_to_list(values) sparse_values = item.get("sparse_values") - if sparse_values and not isinstance(sparse_values, SparseValues): - item["sparse_values"] = VectorFactory._dict_to_sparse_values(sparse_values, check_type) + if sparse_values is None: + item.pop("sparse_values", None) + else: + item["sparse_values"] = SparseValuesFactory.build(sparse_values) metadata = item.get("metadata") if metadata and not isinstance(metadata, Mapping): @@ -97,18 +71,3 @@ def _dict_to_vector(item, check_type: bool) -> Vector: if not isinstance(item["values"], Iterable) or not isinstance(item["values"].__iter__().__next__(), numbers.Real): raise TypeError(f"Column `values` is expected to be a list of floats") raise e - - @staticmethod - def _dict_to_sparse_values(sparse_values_dict: Dict, check_type: bool) -> SparseValues: - if not isinstance(sparse_values_dict, Mapping): - raise SparseValuesDictionaryExpectedError(sparse_values_dict) - if not {"indices", "values"}.issubset(sparse_values_dict): - raise SparseValuesMissingKeysError(sparse_values_dict) - - indices = convert_to_list(sparse_values_dict.get("indices")) - values = convert_to_list(sparse_values_dict.get("values")) - - try: - return SparseValues(indices=indices, values=values, _check_type=check_type) - except TypeError: - raise SparseValuesTypeError() \ No newline at end of file diff --git a/pinecone/exceptions.py b/pinecone/exceptions.py index 24319f65..81a52f61 100644 --- a/pinecone/exceptions.py +++ b/pinecone/exceptions.py @@ -17,6 +17,10 @@ class PineconeProtocolError(PineconeException): class PineconeConfigurationError(PineconeException): """Raised when a configuration error occurs.""" +class ListConversionException(PineconeException, TypeError): + def __init__(self, message): + super().__init__(message) + __all__ = [ "PineconeConfigurationError", "PineconeProtocolError", @@ -30,4 +34,5 @@ class PineconeConfigurationError(PineconeException): "UnauthorizedException", "ForbiddenException", "ServiceException", + "ListConversionException" ] diff --git a/pinecone/grpc/sparse_values_factory.py b/pinecone/grpc/sparse_values_factory.py new file mode 100644 index 00000000..452f131d --- /dev/null +++ b/pinecone/grpc/sparse_values_factory.py @@ -0,0 +1,59 @@ +import numbers + +from collections.abc import Mapping +from typing import Union, Dict + +from ..utils import convert_to_list + +from ..data import ( + SparseValuesTypeError, + SparseValuesMissingKeysError, + SparseValuesDictionaryExpectedError +) + +from pinecone.core.grpc.protos.vector_service_pb2 import ( + SparseValues as GRPCSparseValues, +) +from pinecone import ( + SparseValues as NonGRPCSparseValues +) + +class SparseValuesFactory: + @staticmethod + def build(input: Union[Dict, GRPCSparseValues, NonGRPCSparseValues]) -> GRPCSparseValues: + if input is None: + return input + if isinstance(input, GRPCSparseValues): + return input + if isinstance(input, NonGRPCSparseValues): + return GRPCSparseValues(indices=input.indices, values=input.values) + if not isinstance(input, Mapping): + raise SparseValuesDictionaryExpectedError(input) + if not {"indices", "values"}.issubset(input): + raise SparseValuesMissingKeysError(input) + + indices = SparseValuesFactory._convert_to_list(input.get("indices"), int) + values = SparseValuesFactory._convert_to_list(input.get("values"), float) + + if len(indices) != len(values): + raise ValueError("Sparse values indices and values must have the same length") + + try: + return GRPCSparseValues(indices=indices, values=values) + except TypeError as e: + raise SparseValuesTypeError() from e + + @staticmethod + def _convert_to_list(input, expected_type): + try: + converted = convert_to_list(input) + except TypeError as e: + raise SparseValuesTypeError() from e + + SparseValuesFactory._validate_list_items_type(converted, expected_type) + return converted + + @staticmethod + def _validate_list_items_type(input, expected_type): + if len(input) > 0 and not isinstance(input[0], expected_type): + raise SparseValuesTypeError() \ No newline at end of file diff --git a/pinecone/grpc/vector_factory_grpc.py b/pinecone/grpc/vector_factory_grpc.py index c5368d73..cb36338b 100644 --- a/pinecone/grpc/vector_factory_grpc.py +++ b/pinecone/grpc/vector_factory_grpc.py @@ -12,11 +12,9 @@ VectorDictionaryMissingKeysError, VectorDictionaryExcessKeysError, VectorTupleLengthError, - SparseValuesTypeError, - SparseValuesMissingKeysError, - SparseValuesDictionaryExpectedError, MetadataDictionaryExpectedError ) +from .sparse_values_factory import SparseValuesFactory from pinecone.core.grpc.protos.vector_service_pb2 import ( Vector as GRPCVector, @@ -73,8 +71,8 @@ def _dict_to_vector(item) -> GRPCVector: raise TypeError(f"Column `values` is expected to be a list of floats") from e sparse_values = item.get("sparse_values") - if sparse_values and not isinstance(sparse_values, GRPCSparseValues): - item["sparse_values"] = VectorFactoryGRPC._dict_to_sparse_values(sparse_values) + if sparse_values != None and not isinstance(sparse_values, GRPCSparseValues): + item["sparse_values"] = SparseValuesFactory.build(sparse_values) metadata = item.get("metadata") if metadata: @@ -90,32 +88,4 @@ def _dict_to_vector(item) -> GRPCVector: except TypeError as e: if not isinstance(item["values"], Iterable) or not isinstance(item["values"].__iter__().__next__(), numbers.Real): raise TypeError(f"Column `values` is expected to be a list of floats") - raise e - - @staticmethod - def _dict_to_sparse_values(sparse_values_dict: Union[Dict, GRPCSparseValues, NonGRPCSparseValues]) -> GRPCSparseValues: - if isinstance(sparse_values_dict, GRPCSparseValues): - return sparse_values_dict - if isinstance(sparse_values_dict, NonGRPCSparseValues): - return GRPCSparseValues(indices=sparse_values_dict.indices, values=sparse_values_dict.values) - - if not isinstance(sparse_values_dict, Mapping): - raise SparseValuesDictionaryExpectedError(sparse_values_dict) - if not {"indices", "values"}.issubset(sparse_values_dict): - raise SparseValuesMissingKeysError(sparse_values_dict) - - - try: - indices = convert_to_list(sparse_values_dict.get("indices")) - except TypeError as e: - raise SparseValuesTypeError() from e - - try: - values = convert_to_list(sparse_values_dict.get("values")) - except TypeError as e: - raise SparseValuesTypeError() from e - - try: - return GRPCSparseValues(indices=indices, values=values) - except TypeError as e: - raise SparseValuesTypeError() from e \ No newline at end of file + raise e \ No newline at end of file diff --git a/pinecone/utils/convert_to_list.py b/pinecone/utils/convert_to_list.py index ce28a9f5..eb57ef72 100644 --- a/pinecone/utils/convert_to_list.py +++ b/pinecone/utils/convert_to_list.py @@ -1,3 +1,5 @@ +from ..exceptions import ListConversionException + def convert_to_list(obj): class_name = obj.__class__.__name__ @@ -5,5 +7,12 @@ def convert_to_list(obj): return obj elif hasattr(obj, 'tolist') and callable(getattr(obj, 'tolist')): return obj.tolist() + elif obj is None or isinstance(obj, str) or isinstance(obj, dict): + # The string and dictionary classes in python can be passed to list() + # but they're not going to yield sensible results for our use case. + raise ListConversionException(f"Expected a list or list-like data structure, but got: {obj}") else: - return list(obj) \ No newline at end of file + try: + return list(obj) + except Exception as e: + raise ListConversionException(f"Expected a list or list-like data structure, but got: {obj}") from e \ No newline at end of file diff --git a/tests/unit/data/test_vector_factory.py b/tests/unit/data/test_vector_factory.py index bf713a66..0f1a92fa 100644 --- a/tests/unit/data/test_vector_factory.py +++ b/tests/unit/data/test_vector_factory.py @@ -3,7 +3,7 @@ import pytest from pinecone.data.vector_factory import VectorFactory -from pinecone import Vector, SparseValues +from pinecone import Vector, SparseValues, ListConversionException class TestVectorFactory: def test_build_when_returns_vector_unmodified(self): @@ -11,29 +11,43 @@ def test_build_when_returns_vector_unmodified(self): assert VectorFactory.build(vec) == vec assert VectorFactory.build(vec).__class__ == Vector - def test_build_when_tuple_with_two_values(self): - tup = ('1', [0.1, 0.2, 0.3]) + @pytest.mark.parametrize('values_array', [ + [0.1, 0.2, 0.3], + np.array([0.1, 0.2, 0.3]), + pd.array([0.1, 0.2, 0.3]) + ]) + def test_build_when_tuple_with_two_values(self, values_array): + tup = ('1', values_array) actual = VectorFactory.build(tup) expected = Vector(id='1', values=[0.1, 0.2, 0.3], metadata={}) assert actual == expected - def test_build_when_tuple_with_three_values(self): - tup = ('1', [0.1, 0.2, 0.3], {'genre': 'comedy'}) + @pytest.mark.parametrize('values_array', [ + [0.1, 0.2, 0.3], + np.array([0.1, 0.2, 0.3]), + pd.array([0.1, 0.2, 0.3]) + ]) + def test_build_when_tuple_with_three_values(self, values_array): + tup = ('1', values_array, {'genre': 'comedy'}) actual = VectorFactory.build(tup) expected = Vector(id='1', values=[0.1, 0.2, 0.3], metadata={'genre': 'comedy'}) assert actual == expected - def test_build_when_tuple_with_numpy_array(self): - tup = ('1', np.array([0.1, 0.2, 0.3]), {'genre': 'comedy'}) - actual = VectorFactory.build(tup) - expected = Vector(id='1', values=[0.1, 0.2, 0.3], metadata={'genre': 'comedy'}) - assert actual == expected + @pytest.mark.parametrize("vector_tup", [ + ("1", 'not an array'), + ("1", {}), + ("1", None), + ("1", 'not an array', {"genre": "comedy"}), + ("1", {}, {"genre": "comedy"}), + ("1", None, {"genre": "comedy"}), + ]) + def test_build_when_tuple_values_must_be_list(self, vector_tup): + with pytest.raises( + ListConversionException, + match="Expected a list or list-like data structure", + ): + VectorFactory.build(vector_tup) - def test_build_when_tuple_with_pandas_array(self): - tup = ('1', pd.array([0.1, 0.2, 0.3])) - actual = VectorFactory.build(tup) - expected = Vector(id='1', values=[0.1, 0.2, 0.3], metadata={}) - assert actual == expected def test_build_when_tuple_errors_when_additional_fields(self): with pytest.raises(ValueError, match="Found a tuple of length 4 which is not supported"): @@ -45,20 +59,13 @@ def test_build_when_tuple_too_short(self): tup = ('1',) VectorFactory.build(tup) - def test_build_when_dict(self): - d = { 'id': '1', 'values': [0.1, 0.2, 0.3], 'metadata': {'genre': 'comedy'}} - actual = VectorFactory.build(d) - expected = Vector(id='1', values=[0.1, 0.2, 0.3], metadata={'genre': 'comedy'}) - assert actual == expected - - def test_build_when_dict_with_numpy_values(self): - d = { 'id': '1', 'values': np.array([0.1, 0.2, 0.3]), 'metadata': {'genre': 'comedy'}} - actual = VectorFactory.build(d) - expected = Vector(id='1', values=[0.1, 0.2, 0.3], metadata={'genre': 'comedy'}) - assert actual == expected - - def test_build_when_dict_with_pandas_values(self): - d = { 'id': '1', 'values': pd.array([0.1, 0.2, 0.3]), 'metadata': {'genre': 'comedy'}} + @pytest.mark.parametrize('values_array', [ + [0.1, 0.2, 0.3], + np.array([0.1, 0.2, 0.3]), + pd.array([0.1, 0.2, 0.3]) + ]) + def test_build_when_dict(self, values_array): + d = { 'id': '1', 'values': values_array, 'metadata': {'genre': 'comedy'}} actual = VectorFactory.build(d) expected = Vector(id='1', values=[0.1, 0.2, 0.3], metadata={'genre': 'comedy'}) assert actual == expected @@ -137,3 +144,14 @@ def test_build_when_dict_sparse_values_when_indices_is_ndarray(self): def test_build_when_errors_when_other_type(self): with pytest.raises(ValueError, match="Invalid vector value passed: cannot interpret type"): VectorFactory.build(1) + + def test_build_when_sparse_values_is_None(self): + d = { + 'id': '1', + 'values': [0.1, 0.2, 0.3], + 'metadata': {'genre': 'comedy'}, + 'sparse_values': None + } + actual = VectorFactory.build(d) + expected = Vector(id='1', values=[0.1, 0.2, 0.3], metadata={'genre': 'comedy'}) + assert actual == expected \ No newline at end of file diff --git a/tests/unit/utils/test_convert_to_list.py b/tests/unit/utils/test_convert_to_list.py index 384e169f..8107c125 100644 --- a/tests/unit/utils/test_convert_to_list.py +++ b/tests/unit/utils/test_convert_to_list.py @@ -37,3 +37,23 @@ def test_convert_to_list_when_already_list(): actual = convert_to_list(obj) expected = [1, 2, 3] assert actual == expected + +@pytest.mark.parametrize("input", [ + "", + "not a list", + {} +]) +def test_invalid_iterable_inputs(input): + with pytest.raises(TypeError, match="Expected a list or list-like data structure"): + convert_to_list(input) + +@pytest.mark.parametrize("invalid_input", [ + None, + 1, + 0, + 1.0, + True +]) +def test_invalid_non_iterable_input(invalid_input): + with pytest.raises(TypeError, match="Expected a list or list-like data structure"): + convert_to_list(invalid_input) \ No newline at end of file diff --git a/tests/unit_grpc/test_sparse_values_factory.py b/tests/unit_grpc/test_sparse_values_factory.py new file mode 100644 index 00000000..5bd6a50b --- /dev/null +++ b/tests/unit_grpc/test_sparse_values_factory.py @@ -0,0 +1,88 @@ +import pytest +from pinecone.grpc import SparseValues as GRPCSparseValues +from pinecone import SparseValues as NonGRPCSparseValues + +import numpy as np +import pandas as pd + + +from pinecone.grpc.sparse_values_factory import SparseValuesFactory + +class TestSparseValuesFactory: + def test_build_when_None(self): + assert SparseValuesFactory.build(None) == None + + def test_build_when_passed_GRPCSparseValues(self): + """ + Return without modification when given GRPCSparseValues + """ + sv = GRPCSparseValues(indices=[0, 2], values=[0.1, 0.3]) + actual = SparseValuesFactory.build(sv) + assert actual == sv + + def test_build_when_passed_NonGRPCSparseValues(self): + """ + Convert when given NonGRPCSparseValues + """ + sv = NonGRPCSparseValues(indices=[0, 2], values=[0.1, 0.3]) + actual = SparseValuesFactory.build(sv) + expected = GRPCSparseValues(indices=[0, 2], values=[0.1, 0.3]) + assert actual == expected + + @pytest.mark.parametrize('input', [ + {'indices': [2], 'values': [0.3]}, + {'indices': [88, 102], 'values': [-0.1, 0.3]}, + {'indices': [0, 2, 4], 'values': [0.1, 0.3, 0.5]}, + {'indices': [0, 2, 4, 6], 'values': [0.1, 0.3, 0.5, 0.7]}, + ]) + def test_build_when_valid_dictionary(self, input): + actual = SparseValuesFactory.build(input) + expected = GRPCSparseValues(indices=input['indices'], values=input['values']) + assert actual == expected + + @pytest.mark.parametrize('input', [ + {'indices': np.array([0, 2]), 'values': [0.1, 0.3]}, + {'indices': [0, 2], 'values': np.array([0.1, 0.3])}, + {'indices': np.array([0, 2]), 'values': np.array([0.1, 0.3])}, + {'indices': pd.array([0, 2]), 'values': [0.1, 0.3]}, + {'indices': [0, 2], 'values': pd.array([0.1, 0.3])}, + {'indices': pd.array([0, 2]), 'values': pd.array([0.1, 0.3])}, + {'indices': np.array([0, 2]), 'values': pd.array([0.1, 0.3])}, + {'indices': pd.array([0, 2]), 'values': np.array([0.1, 0.3])}, + ]) + def test_build_when_special_data_types(self, input): + """ + Test that the factory can handle special data types like + numpy/pandas integer and float arrays. + """ + actual = SparseValuesFactory.build(input) + expected = GRPCSparseValues(indices=[0, 2], values=[0.1, 0.3]) + assert actual == expected + + @pytest.mark.parametrize('input', [ + {'indices': [2], 'values': [0.3, 0.3]}, + {'indices': [88, 102], 'values': [-0.1]}, + ]) + def test_build_when_list_sizes_dont_match(self, input): + with pytest.raises(ValueError, match="Sparse values indices and values must have the same length"): + SparseValuesFactory.build(input) + + @pytest.mark.parametrize('input', [ + {'indices': [2.0], 'values': [0.3]}, + {'indices': ['2'], 'values': [0.3]}, + {'indices': np.array([2.0]), 'values': [0.3]}, + {'indices': pd.array([2.0]), 'values': [0.3]}, + ]) + def test_build_when_non_integer_indices(self, input): + with pytest.raises(ValueError, match="Found unexpected data in column `sparse_values`"): + SparseValuesFactory.build(input) + + @pytest.mark.parametrize('input', [ + {'indices': [2], 'values': [3]}, + {'indices': [2], 'values': ['3.2']}, + {'indices': [2], 'values': np.array([3])}, + {'indices': [2], 'values': pd.array([3])}, + ]) + def test_build_when_non_float_values(self, input): + with pytest.raises(ValueError, match="Found unexpected data in column `sparse_values`"): + SparseValuesFactory.build(input) diff --git a/tests/unit_grpc/test_vector_factory_grpc.py b/tests/unit_grpc/test_vector_factory_grpc.py index 491f1c39..954ab919 100644 --- a/tests/unit_grpc/test_vector_factory_grpc.py +++ b/tests/unit_grpc/test_vector_factory_grpc.py @@ -2,29 +2,31 @@ import pandas as pd import pytest +from collections.abc import Iterable, Mapping + from pinecone.grpc.vector_factory_grpc import VectorFactoryGRPC from pinecone.grpc import Vector, SparseValues from pinecone.grpc.utils import dict_to_proto_struct from pinecone import Vector as NonGRPCVector, SparseValues as NonGRPCSparseValues -class TestVectorFactory: +class TestVectorFactoryGRPC: def test_build_when_returns_vector_unmodified(self): vec = Vector(id="1", values=[0.1, 0.2, 0.3]) assert VectorFactoryGRPC.build(vec) == vec assert VectorFactoryGRPC.build(vec).__class__ == Vector - def test_build_when_nongrpc_vector(self): + def test_build_when_nongrpc_vector_it_converts(self): vec = NonGRPCVector(id="1", values=[0.1, 0.2, 0.3]) assert VectorFactoryGRPC.build(vec) == Vector(id="1", values=[0.1, 0.2, 0.3], metadata=dict_to_proto_struct({})) - def test_build_when_nongrpc_vector_with_metadata(self): + def test_build_when_nongrpc_vector_with_metadata_it_converts(self): vec = NonGRPCVector(id="1", values=[0.1, 0.2, 0.3], metadata={"genre": "comedy"}) assert VectorFactoryGRPC.build(vec) == Vector( id="1", values=[0.1, 0.2, 0.3], metadata=dict_to_proto_struct({"genre": "comedy"}) ) - def test_build_when_nongrpc_vector_with_sparse_values(self): + def test_build_when_nongrpc_vector_with_sparse_values_it_converts(self): vec = NonGRPCVector( id="1", values=[0.1, 0.2, 0.3], sparse_values=NonGRPCSparseValues(indices=[0, 2], values=[0.1, 0.3]) ) @@ -35,20 +37,37 @@ def test_build_when_nongrpc_vector_with_sparse_values(self): sparse_values=SparseValues(indices=[0, 2], values=[0.1, 0.3]), ) - def test_build_when_tuple_with_two_values(self): - tup = ("1", [0.1, 0.2, 0.3]) - actual = VectorFactoryGRPC.build(tup) - expected = Vector(id="1", values=[0.1, 0.2, 0.3], metadata={}) - assert actual == expected - - def test_build_when_tuple_with_three_values(self): - tup = ("1", [0.1, 0.2, 0.3], {"genre": "comedy"}) + @pytest.mark.parametrize("values_array", [ + [0.1, 0.2, 0.3], + np.array([0.1, 0.2, 0.3]), + pd.array([0.1, 0.2, 0.3]) + ]) + def test_build_when_tuple_with_two_values(self, values_array): + tup = ("1", values_array) actual = VectorFactoryGRPC.build(tup) - expected = Vector(id="1", values=[0.1, 0.2, 0.3], metadata=dict_to_proto_struct({"genre": "comedy"})) + expected = Vector(id="1", values=[0.1, 0.2, 0.3], metadata=dict_to_proto_struct({})) assert actual == expected - def test_build_when_tuple_with_numpy_array(self): - tup = ("1", np.array([0.1, 0.2, 0.3]), {"genre": "comedy"}) + @pytest.mark.parametrize("vector_tup", [ + ("1", 'not an array'), + ("1", {}), + ("1", 'not an array', {"genre": "comedy"}), + ("1", {}, {"genre": "comedy"}) + ]) + def test_build_when_tuple_values_must_be_list(self, vector_tup): + with pytest.raises( + TypeError, + match="Expected a list or list-like data structure", + ): + VectorFactoryGRPC.build(vector_tup) + + @pytest.mark.parametrize("values_array", [ + [0.1, 0.2, 0.3], + np.array([0.1, 0.2, 0.3]), + pd.array([0.1, 0.2, 0.3]) + ]) + def test_build_when_tuple_with_three_values(self, values_array): + tup = ("1", values_array, {"genre": "comedy"}) actual = VectorFactoryGRPC.build(tup) expected = Vector(id="1", values=[0.1, 0.2, 0.3], metadata=dict_to_proto_struct({"genre": "comedy"})) assert actual == expected @@ -62,12 +81,6 @@ def test_build_vector_with_tuple_with_sparse_values(self, sv_klass): ): VectorFactoryGRPC.build(tup) - def test_build_when_tuple_with_pandas_array(self): - tup = ("1", pd.array([0.1, 0.2, 0.3])) - actual = VectorFactoryGRPC.build(tup) - expected = Vector(id="1", values=[0.1, 0.2, 0.3], metadata=dict_to_proto_struct({})) - assert actual == expected - def test_build_when_tuple_errors_when_additional_fields(self): with pytest.raises(ValueError, match="Found a tuple of length 4 which is not supported"): tup = ("1", [0.1, 0.2, 0.3], {"a": "b"}, "extra") @@ -78,7 +91,10 @@ def test_build_when_tuple_too_short(self): tup = ("1",) VectorFactoryGRPC.build(tup) - @pytest.mark.parametrize("metadata", [{"genre": "comedy"}, dict_to_proto_struct({"genre": "comedy"})]) + @pytest.mark.parametrize("metadata", [ + {"genre": "comedy"}, + dict_to_proto_struct({"genre": "comedy"})] + ) def test_build_when_dict(self, metadata): d = {"id": "1", "values": [0.1, 0.2, 0.3], "metadata": metadata} actual = VectorFactoryGRPC.build(d) @@ -124,16 +140,12 @@ def test_build_with_dict_with_sparse_values_object(self, sv_klass): ) assert actual == expected - @pytest.mark.parametrize("metadata", [{"genre": "comedy"}, dict_to_proto_struct({"genre": "comedy"})]) - def test_build_when_dict_with_numpy_values(self, metadata): - d = {"id": "1", "values": np.array([0.1, 0.2, 0.3]), "metadata": metadata} - actual = VectorFactoryGRPC.build(d) - expected = Vector(id="1", values=[0.1, 0.2, 0.3], metadata=dict_to_proto_struct({"genre": "comedy"})) - assert actual == expected - - @pytest.mark.parametrize("metadata", [{"genre": "comedy"}, dict_to_proto_struct({"genre": "comedy"})]) - def test_build_when_dict_with_pandas_values(self, metadata): - d = {"id": "1", "values": pd.array([0.1, 0.2, 0.3]), "metadata": metadata} + @pytest.mark.parametrize("input_values", [ + pd.array([0.1, 0.2, 0.3]), + np.array([0.1, 0.2, 0.3]) + ]) + def test_build_when_dict_with_special_values(self, input_values): + d = {"id": "1", "values": input_values, "metadata": {"genre": "comedy"}} actual = VectorFactoryGRPC.build(d) expected = Vector(id="1", values=[0.1, 0.2, 0.3], metadata=dict_to_proto_struct({"genre": "comedy"})) assert actual == expected @@ -148,12 +160,20 @@ def test_build_when_dict_excess_keys(self): d = {"id": "1", "values": [0.1, 0.2, 0.3], "metadata": {"genre": "comedy"}, "extra": "field"} VectorFactoryGRPC.build(d) - def test_build_when_dict_sparse_values(self): + @pytest.mark.parametrize("sv_indices,sv_values", [ + ([0, 2], [0.1, 0.3]), + (pd.array([0, 2]), [0.1, 0.3]), + ([0, 2], pd.array([0.1, 0.3])), + (pd.array([0, 2]), pd.array([0.1, 0.3])), + (np.array([0, 2]), [0.1, 0.3]), + ([0, 2], np.array([0.1, 0.3])) + ]) + def test_build_when_dict_sparse_values(self, sv_indices, sv_values): d = { "id": "1", "values": [0.1, 0.2, 0.3], "metadata": {"genre": "comedy"}, - "sparse_values": {"indices": [0, 2], "values": [0.1, 0.3]}, + "sparse_values": {"indices": sv_indices, "values": sv_values}, } actual = VectorFactoryGRPC.build(d) expected = Vector( @@ -180,115 +200,84 @@ def test_build_when_dict_sparse_values_when_SparseValues(self): ) assert actual == expected - def test_build_when_dict_sparse_values_errors_when_not_dict(self): - with pytest.raises(ValueError, match="Column `sparse_values` is expected to be a dictionary"): - d = {"id": "1", "values": [0.1, 0.2, 0.3], "metadata": {"genre": "comedy"}, "sparse_values": "not a dict"} - VectorFactoryGRPC.build(d) - - def test_build_when_dict_sparse_values_errors_when_missing_indices(self): - with pytest.raises(ValueError, match="Missing required keys in data in column `sparse_values`"): + @pytest.mark.parametrize("bogus_sparse_values", [ + 1, + "not an array", + [1, 2], + {} + ]) + def test_build_when_dict_sparse_values_errors_when_invalid_sparse_values_values(self, bogus_sparse_values): + with pytest.raises(ValueError, match="Found unexpected data in column `sparse_values`"): d = { "id": "1", "values": [0.1, 0.2, 0.3], "metadata": {"genre": "comedy"}, - "sparse_values": {"values": [0.1, 0.3]}, + "sparse_values": {"indices": [1, 2], "values": bogus_sparse_values}, } VectorFactoryGRPC.build(d) - def test_build_when_dict_sparse_values_errors_when_missing_values(self): - with pytest.raises(ValueError, match="Missing required keys in data in column `sparse_values`"): + @pytest.mark.parametrize("bogus_sparse_indices", [ + 1, + "not an array", + [0.1, 0.2], + {} + ]) + def test_build_when_dict_sparse_values_errors_when_indices_not_valid_list(self, bogus_sparse_indices): + with pytest.raises(ValueError, match="Found unexpected data in column `sparse_values`"): d = { "id": "1", "values": [0.1, 0.2, 0.3], "metadata": {"genre": "comedy"}, - "sparse_values": {"indices": [0, 2]}, + "sparse_values": {"indices": bogus_sparse_indices, "values": [0.1, 0.1]}, } VectorFactoryGRPC.build(d) - def test_build_when_dict_sparse_values_errors_when_indices_not_list(self): - with pytest.raises(ValueError, match="Found unexpected data in column `sparse_values`"): + def test_build_when_errors_when_other_type(self): + with pytest.raises(ValueError, match="Invalid vector value passed: cannot interpret type"): + VectorFactoryGRPC.build(1) + + @pytest.mark.parametrize("bogus_sparse_values", [ + 1, + "not a dict", + [1, 2, 3], + [], + ]) + def test_build_when_invalid_sparse_values_type_in_dict(self, bogus_sparse_values): + with pytest.raises(ValueError, match="Column `sparse_values` is expected to be a dictionary"): d = { - "id": "1", - "values": [0.1, 0.2, 0.3], - "metadata": {"genre": "comedy"}, - "sparse_values": {"indices": "not a list", "values": [0.1, 0.3]}, + 'id': '1', + 'values': [0.1, 0.2, 0.3], + 'metadata': {'genre': 'comedy'}, + 'sparse_values': bogus_sparse_values # not a valid dict } VectorFactoryGRPC.build(d) - def test_build_when_dict_sparse_values_errors_when_values_not_list(self): - with pytest.raises(ValueError, match="Found unexpected data in column `sparse_values`"): + @pytest.mark.parametrize("bogus_sparse_values", [ + {}, + {'indices': [0, 2]}, + {'values': [0.1, 0.3]}, + ]) + def test_build_when_missing_keys_in_sparse_values_dict(self, bogus_sparse_values): + with pytest.raises(ValueError, match="Missing required keys in data in column `sparse_values`"): d = { - "id": "1", - "values": [0.1, 0.2, 0.3], - "metadata": {"genre": "comedy"}, - "sparse_values": {"indices": [0, 2], "values": "not a list"}, + 'id': '1', + 'values': [0.1, 0.2, 0.3], + 'metadata': {'genre': 'comedy'}, + 'sparse_values': bogus_sparse_values } VectorFactoryGRPC.build(d) - def test_build_when_dict_sparse_values_when_values_is_ndarray(self): - d = { - "id": "1", - "values": [0.1, 0.2, 0.3], - "metadata": {"genre": "comedy"}, - "sparse_values": {"indices": [0, 2], "values": np.array([0.1, 0.3])}, - } - actual = VectorFactoryGRPC.build(d) - expected = Vector( - id="1", - values=[0.1, 0.2, 0.3], - metadata=dict_to_proto_struct({"genre": "comedy"}), - sparse_values=SparseValues(indices=[0, 2], values=[0.1, 0.3]), - ) - assert actual == expected - - def test_build_when_dict_sparse_values_when_indices_is_pandas_IntegerArray(self): - d = { - "id": "1", - "values": [0.1, 0.2, 0.3], - "metadata": {"genre": "comedy"}, - "sparse_values": {"indices": pd.array([0, 2]), "values": [0.1, 0.3]}, - } - actual = VectorFactoryGRPC.build(d) - expected = Vector( - id="1", - values=[0.1, 0.2, 0.3], - metadata=dict_to_proto_struct({"genre": "comedy"}), - sparse_values=SparseValues(indices=[0, 2], values=[0.1, 0.3]), - ) - assert actual == expected - - def test_build_when_dict_sparse_values_when_values_is_pandas_FloatingArray(self): - d = { - "id": "1", - "values": [0.1, 0.2, 0.3], - "metadata": {"genre": "comedy"}, - "sparse_values": {"indices": [0, 2], "values": pd.array([0.1, 0.3])}, - } - actual = VectorFactoryGRPC.build(d) - expected = Vector( - id="1", - values=[0.1, 0.2, 0.3], - metadata=dict_to_proto_struct({"genre": "comedy"}), - sparse_values=SparseValues(indices=[0, 2], values=[0.1, 0.3]), - ) - assert actual == expected - - def test_build_when_dict_sparse_values_when_indices_is_ndarray(self): + def test_build_when_sparse_values_is_None(self): d = { - "id": "1", - "values": [0.1, 0.2, 0.3], - "metadata": {"genre": "comedy"}, - "sparse_values": {"indices": np.array([0, 2]), "values": [0.1, 0.3]}, + 'id': '1', + 'values': [0.1, 0.2, 0.3], + 'metadata': {'genre': 'comedy'}, + 'sparse_values': None } actual = VectorFactoryGRPC.build(d) expected = Vector( - id="1", - values=[0.1, 0.2, 0.3], - metadata=dict_to_proto_struct({"genre": "comedy"}), - sparse_values=SparseValues(indices=[0, 2], values=[0.1, 0.3]), + id='1', + values=[0.1, 0.2, 0.3], + metadata=dict_to_proto_struct({'genre': 'comedy'}) ) - assert actual == expected - - def test_build_when_errors_when_other_type(self): - with pytest.raises(ValueError, match="Invalid vector value passed: cannot interpret type"): - VectorFactoryGRPC.build(1) + assert actual == expected \ No newline at end of file