-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(Low-Code Concurrent CDK): Add ConcurrentPerPartitionCursor #111
base: main
Are you sure you want to change the base?
Changes from all commits
d326a26
37efbae
a3304b9
4ddbb84
41b029d
eb8eec8
dfcf17f
b84e68a
2038075
c77b9a2
c59ed5a
a01c0b5
357a925
79ffb77
a36726b
5ee05f1
24268e2
660da93
23d3059
f3a00ff
d6bec35
11b86a9
871f1fe
6d2343a
ed687f5
cfef872
4260415
089137f
3489c7a
301bd31
9574f8c
5ab4ee3
36c4992
b6707ef
cf5107f
df0993e
daa6873
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,9 @@ | |
ClientSideIncrementalRecordFilterDecorator, | ||
) | ||
from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor | ||
from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import ( | ||
PerPartitionWithGlobalCursor, | ||
) | ||
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString | ||
from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource | ||
from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( | ||
|
@@ -32,7 +35,7 @@ | |
ModelToComponentFactory, | ||
) | ||
from airbyte_cdk.sources.declarative.requesters import HttpRequester | ||
from airbyte_cdk.sources.declarative.retrievers import SimpleRetriever | ||
from airbyte_cdk.sources.declarative.retrievers import Retriever, SimpleRetriever | ||
from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( | ||
DeclarativePartitionFactory, | ||
StreamSlicerPartitionGenerator, | ||
|
@@ -230,21 +233,7 @@ def _group_streams( | |
stream_state=stream_state, | ||
) | ||
|
||
retriever = declarative_stream.retriever | ||
|
||
# This is an optimization so that we don't invoke any cursor or state management flows within the | ||
# low-code framework because state management is handled through the ConcurrentCursor. | ||
if declarative_stream and isinstance(retriever, SimpleRetriever): | ||
# Also a temporary hack. In the legacy Stream implementation, as part of the read, | ||
# set_initial_state() is called to instantiate incoming state on the cursor. Although we no | ||
# longer rely on the legacy low-code cursor for concurrent checkpointing, low-code components | ||
# like StopConditionPaginationStrategyDecorator and ClientSideIncrementalRecordFilterDecorator | ||
# still rely on a DatetimeBasedCursor that is properly initialized with state. | ||
if retriever.cursor: | ||
retriever.cursor.set_initial_state(stream_state=stream_state) | ||
# We zero it out here, but since this is a cursor reference, the state is still properly | ||
# instantiated for the other components that reference it | ||
retriever.cursor = None | ||
retriever = self._get_retriever(declarative_stream, stream_state) | ||
|
||
partition_generator = StreamSlicerPartitionGenerator( | ||
DeclarativePartitionFactory( | ||
|
@@ -304,6 +293,60 @@ def _group_streams( | |
cursor=final_state_cursor, | ||
) | ||
) | ||
elif ( | ||
incremental_sync_component_definition | ||
and incremental_sync_component_definition.get("type", "") | ||
== DatetimeBasedCursorModel.__name__ | ||
and self._stream_supports_concurrent_partition_processing( | ||
declarative_stream=declarative_stream | ||
) | ||
and hasattr(declarative_stream.retriever, "stream_slicer") | ||
and isinstance( | ||
declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor | ||
) | ||
): | ||
stream_state = state_manager.get_stream_state( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The inside of the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added helper method |
||
stream_name=declarative_stream.name, namespace=declarative_stream.namespace | ||
) | ||
partition_router = declarative_stream.retriever.stream_slicer._partition_router | ||
|
||
perpartition_cursor = ( | ||
self._constructor.create_concurrent_cursor_from_perpartition_cursor( | ||
state_manager=state_manager, | ||
model_type=DatetimeBasedCursorModel, | ||
component_definition=incremental_sync_component_definition, | ||
stream_name=declarative_stream.name, | ||
stream_namespace=declarative_stream.namespace, | ||
config=config or {}, | ||
stream_state=stream_state, | ||
partition_router=partition_router, | ||
) | ||
) | ||
|
||
retriever = self._get_retriever(declarative_stream, stream_state) | ||
|
||
partition_generator = StreamSlicerPartitionGenerator( | ||
DeclarativePartitionFactory( | ||
declarative_stream.name, | ||
declarative_stream.get_json_schema(), | ||
retriever, | ||
self.message_repository, | ||
), | ||
perpartition_cursor, | ||
) | ||
|
||
concurrent_streams.append( | ||
DefaultStream( | ||
partition_generator=partition_generator, | ||
name=declarative_stream.name, | ||
json_schema=declarative_stream.get_json_schema(), | ||
availability_strategy=AlwaysAvailableAvailabilityStrategy(), | ||
primary_key=get_primary_key_from_stream(declarative_stream.primary_key), | ||
cursor_field=perpartition_cursor.cursor_field.cursor_field_key, | ||
logger=self.logger, | ||
cursor=perpartition_cursor, | ||
) | ||
) | ||
else: | ||
synchronous_streams.append(declarative_stream) | ||
else: | ||
|
@@ -394,6 +437,27 @@ def _stream_supports_concurrent_partition_processing( | |
return False | ||
return True | ||
|
||
def _get_retriever( | ||
self, declarative_stream: DeclarativeStream, stream_state: Mapping[str, Any] | ||
) -> Retriever: | ||
retriever = declarative_stream.retriever | ||
|
||
# This is an optimization so that we don't invoke any cursor or state management flows within the | ||
# low-code framework because state management is handled through the ConcurrentCursor. | ||
if declarative_stream and isinstance(retriever, SimpleRetriever): | ||
# Also a temporary hack. In the legacy Stream implementation, as part of the read, | ||
# set_initial_state() is called to instantiate incoming state on the cursor. Although we no | ||
# longer rely on the legacy low-code cursor for concurrent checkpointing, low-code components | ||
# like StopConditionPaginationStrategyDecorator and ClientSideIncrementalRecordFilterDecorator | ||
# still rely on a DatetimeBasedCursor that is properly initialized with state. | ||
if retriever.cursor: | ||
retriever.cursor.set_initial_state(stream_state=stream_state) | ||
# We zero it out here, but since this is a cursor reference, the state is still properly | ||
# instantiated for the other components that reference it | ||
retriever.cursor = None | ||
|
||
return retriever | ||
|
||
@staticmethod | ||
def _select_streams( | ||
streams: List[AbstractStream], configured_catalog: ConfiguredAirbyteCatalog | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -59,13 +59,11 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter): | |
|
||
def __init__( | ||
self, | ||
date_time_based_cursor: DatetimeBasedCursor, | ||
substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]], | ||
cursor: Union[DatetimeBasedCursor, PerPartitionWithGlobalCursor, GlobalSubstreamCursor], | ||
**kwargs: Any, | ||
): | ||
super().__init__(**kwargs) | ||
self._date_time_based_cursor = date_time_based_cursor | ||
self._substream_cursor = substream_cursor | ||
self._cursor = cursor | ||
|
||
def filter_records( | ||
self, | ||
|
@@ -77,7 +75,7 @@ def filter_records( | |
records = ( | ||
record | ||
for record in records | ||
if (self._substream_cursor or self._date_time_based_cursor).should_be_synced( | ||
if self._cursor.should_be_synced( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This change is so beautiful that I want to cry ❤️ |
||
# Record is created on the fly to align with cursors interface; stream name is ignored as we don't need it here | ||
# Record stream name is empty cause it is not used durig the filtering | ||
Record(data=record, associated_slice=stream_slice, stream_name="") | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we filter only on list partition router until we support the global cursor part?