Commit
ensure no duplicates are returned when we chunk for the user
haakonvt committed Jul 25, 2024
1 parent 8a706f3 commit ef12e48
Showing 2 changed files with 11 additions and 8 deletions.
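
Before the diffs, a minimal sketch (not part of the commit, using hypothetical external IDs and a chunk size of 2 standing in for self._LIST_SUBQUERY_LIMIT) of the bug being fixed: when a duplicated external ID lands in two different chunks, each chunk's subquery matches the same relationships, which are then returned to the user more than once.

def split_into_chunks(seq, n):
    # local stand-in for cognite.client.utils._auxiliary.split_into_chunks
    return [seq[i : i + n] for i in range(0, len(seq), n)]

def remove_duplicates_keep_order(seq):
    # the helper this commit starts calling, reimplemented for the sketch
    seen = set()
    add = seen.add
    return [x for x in seq if x not in seen and not add(x)]

target_external_ids = ["pump-1", "valve-7", "pump-1"]  # "pump-1" passed twice
print(split_into_chunks(target_external_ids, 2))
# [['pump-1', 'valve-7'], ['pump-1']] -- both chunks match "pump-1", so its
# relationships are fetched, and returned, once per chunk.
print(split_into_chunks(remove_duplicates_keep_order(target_external_ids), 2))
# [['pump-1', 'valve-7']] -- deduplicating first removes the repeated subquery.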
17 changes: 10 additions & 7 deletions cognite/client/_api/relationships.py
@@ -14,7 +14,7 @@
 )
 from cognite.client.data_classes.labels import LabelFilter
 from cognite.client.data_classes.relationships import RelationshipCore
-from cognite.client.utils._auxiliary import is_unlimited, split_into_chunks
+from cognite.client.utils._auxiliary import is_unlimited, remove_duplicates_keep_order, split_into_chunks
 from cognite.client.utils._identifier import IdentifierSequence
 from cognite.client.utils._validation import assert_type, process_data_set_ids
 from cognite.client.utils.useful_types import SequenceNotStr
@@ -293,8 +293,11 @@ def list(
             labels=labels,
         ).dump(camel_case=True)

-        target_external_ids, source_external_ids = target_external_ids or [], source_external_ids or []
-        if all(len(xids) <= self._LIST_SUBQUERY_LIMIT for xids in (target_external_ids, source_external_ids)):
+        # The API is fine with duplicated target/source IDs - but since we fetch in chunks, we must ensure
+        # we don't ask for the same across chunks so that we don't return duplicates back to the user:
+        unique_target_xids = remove_duplicates_keep_order(target_external_ids or [])
+        unique_source_xids = remove_duplicates_keep_order(source_external_ids or [])
+        if all(len(xids) <= self._LIST_SUBQUERY_LIMIT for xids in (unique_target_xids, unique_source_xids)):
             return self._list(
                 list_cls=RelationshipList,
                 resource_cls=Relationship,
@@ -309,17 +312,17 @@
                 f"Querying more than {self._LIST_SUBQUERY_LIMIT} source_external_ids/target_external_ids is only "
                 f"supported for unlimited queries (pass -1 / None / inf instead of {limit})"
             )
-        target_chunks = split_into_chunks(target_external_ids, self._LIST_SUBQUERY_LIMIT) or [[]]
-        source_chunks = split_into_chunks(source_external_ids, self._LIST_SUBQUERY_LIMIT) or [[]]
+        target_chunks = split_into_chunks(unique_target_xids, self._LIST_SUBQUERY_LIMIT) or [[]]
+        source_chunks = split_into_chunks(unique_source_xids, self._LIST_SUBQUERY_LIMIT) or [[]]

         # All sources (if any) must be checked against all targets (if any). When either is not
         # given, we must exhaustively list all matching just the source or the target:
         results = []
         for target_xids, source_xids in itertools.product(target_chunks, source_chunks):
             task_filter = filter.copy()
-            if target_external_ids:  # keep null if it was
+            if unique_target_xids:  # keep null if it was
                 task_filter["targetExternalIds"] = target_xids
-            if source_external_ids:
+            if unique_source_xids:
                 task_filter["sourceExternalIds"] = source_xids
             results.extend(
                 self._list(
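
One subtlety worth noting in the hunk above is the `or [[]]` fallback: split_into_chunks applied to an empty list yields an empty list, and itertools.product with an empty operand produces nothing at all, so without the fallback a query filtering only on sources (or only on targets) would run zero subqueries. A hedged sketch, with split_into_chunks reimplemented locally and a hypothetical chunk size of 2:

import itertools

def split_into_chunks(seq, n):
    # local stand-in for cognite.client.utils._auxiliary.split_into_chunks
    return [seq[i : i + n] for i in range(0, len(seq), n)]

source_chunks = split_into_chunks(["s1", "s2", "s3"], 2)  # [['s1', 's2'], ['s3']]
target_chunks = split_into_chunks([], 2) or [[]]          # [] -> [[]]: one "no target filter" chunk
print(list(itertools.product(target_chunks, source_chunks)))
# [([], ['s1', 's2']), ([], ['s3'])] -- two subqueries, each filtering on sources
# only; the empty target chunk is skipped by the `if unique_target_xids:` guard.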
2 changes: 1 addition & 1 deletion cognite/client/utils/_auxiliary.py
@@ -222,7 +222,7 @@ def find_duplicates(seq: Iterable[THashable]) -> set[THashable]:
     return {x for x in seq if x in seen or add(x)}


-def remove_duplicates_keep_order(seq: Sequence[THashable]) -> list[THashable]:
+def remove_duplicates_keep_order(seq: SequenceNotStr[THashable]) -> list[THashable]:
     seen: set[THashable] = set()
     add = seen.add
     return [x for x in seq if x not in seen and not add(x)]
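
The helper itself is the standard ordered-dedup idiom: set.add returns None, so `not add(x)` is always true and exists only to record x as seen, while the `x not in seen` check keeps the first occurrence in input order. The switch from Sequence to SequenceNotStr presumably makes type checkers reject a bare str argument, since a str is itself a Sequence of one-character strings. A quick usage sketch, importing the private helper just for illustration:

from cognite.client.utils._auxiliary import remove_duplicates_keep_order

print(remove_duplicates_keep_order(["b", "a", "b", "c", "a"]))  # ['b', 'a', 'c']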
