From 95b2d437ca1587d43abf1143a685c244a2450af0 Mon Sep 17 00:00:00 2001 From: siddiquebagwan-gslab Date: Mon, 11 Sep 2023 16:55:17 +0530 Subject: [PATCH 01/65] feat(ingestion/looker): Add view file-path as option in view_naming_pattern config (#8713) --- .../ingestion/source/looker/looker_common.py | 153 +++++- .../ingestion/source/looker/looker_config.py | 23 +- .../ingestion/source/looker/lookml_source.py | 75 ++- .../looker/golden_test_file_path_ingest.json | 499 ++++++++++++++++++ .../tests/integration/looker/test_looker.py | 75 ++- .../lookml/refinements_ingestion_golden.json | 353 ++++++++----- .../tests/integration/lookml/test_lookml.py | 9 + 7 files changed, 1021 insertions(+), 166 deletions(-) create mode 100644 metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 40b90d216348c7..89b1e45695c578 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -7,10 +7,26 @@ from dataclasses import dataclass, field as dataclasses_field from enum import Enum from functools import lru_cache -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Tuple, Union +from typing import ( + TYPE_CHECKING, + Dict, + Iterable, + List, + Optional, + Sequence, + Set, + Tuple, + Union, + cast, +) from looker_sdk.error import SDKError -from looker_sdk.sdk.api40.models import LookmlModelExploreField, User, WriteQuery +from looker_sdk.sdk.api40.models import ( + LookmlModelExplore, + LookmlModelExploreField, + User, + WriteQuery, +) from pydantic.class_validators import validator import datahub.emitter.mce_builder as builder @@ -23,6 +39,7 @@ LookerCommonConfig, LookerDashboardSourceConfig, NamingPatternMapping, + ViewNamingPatternMapping, ) from 
datahub.ingestion.source.looker.looker_constant import IMPORTED_PROJECTS from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI @@ -93,14 +110,16 @@ class LookerViewId: project_name: str model_name: str view_name: str + file_path: str - def get_mapping(self, config: LookerCommonConfig) -> NamingPatternMapping: - return NamingPatternMapping( + def get_mapping(self, config: LookerCommonConfig) -> ViewNamingPatternMapping: + return ViewNamingPatternMapping( platform=config.platform_name, env=config.env.lower(), project=self.project_name, model=self.model_name, name=self.view_name, + file_path=self.file_path, ) @validator("view_name") @@ -109,10 +128,35 @@ def remove_quotes(cls, v): v = v.replace('"', "").replace("`", "") return v + def preprocess_file_path(self, file_path: str) -> str: + new_file_path: str = str(file_path) + + str_to_remove: List[str] = [ + "\\.view\\.lkml$", # escape the . using \ + ] + + for pattern in str_to_remove: + new_file_path = re.sub(pattern, "", new_file_path) + + str_to_replace: Dict[str, str] = { + f"^imported_projects/{re.escape(self.project_name)}/": "", # escape any special regex character present in project-name + "/": ".", # / is not urn friendly + } + + for pattern in str_to_replace: + new_file_path = re.sub(pattern, str_to_replace[pattern], new_file_path) + + logger.debug(f"Original file path {file_path}") + logger.debug(f"After preprocessing file path {new_file_path}") + + return new_file_path + def get_urn(self, config: LookerCommonConfig) -> str: - dataset_name = config.view_naming_pattern.replace_variables( - self.get_mapping(config) - ) + n_mapping: ViewNamingPatternMapping = self.get_mapping(config) + + n_mapping.file_path = self.preprocess_file_path(n_mapping.file_path) + + dataset_name = config.view_naming_pattern.replace_variables(n_mapping) return builder.make_dataset_urn_with_platform_instance( platform=config.platform_name, @@ -135,6 +179,10 @@ class ViewFieldType(Enum): UNKNOWN = "Unknown" +class 
ViewFieldValue(Enum): + NOT_AVAILABLE = "NotAvailable" + + @dataclass class ViewField: name: str @@ -161,6 +209,69 @@ def create_view_project_map(view_fields: List[ViewField]) -> Dict[str, str]: return view_project_map +def get_view_file_path( + lkml_fields: List[LookmlModelExploreField], view_name: str +) -> Optional[str]: + """ + Search for the view file path on field, if found then return the file path + """ + logger.debug("Entered") + + for field in lkml_fields: + if field.view == view_name: + # This path is relative to git clone directory + logger.debug(f"Found view({view_name}) file-path {field.source_file}") + return field.source_file + + logger.debug(f"Failed to find view({view_name}) file-path") + + return None + + +def create_upstream_views_file_path_map( + view_names: Set[str], lkml_fields: List[LookmlModelExploreField] +) -> Dict[str, Optional[str]]: + """ + Create a map of view-name v/s view file path, so that later we can fetch view's file path via view-name + """ + + upstream_views_file_path: Dict[str, Optional[str]] = {} + + for view_name in view_names: + file_path: Optional[str] = get_view_file_path( + lkml_fields=lkml_fields, view_name=view_name + ) + + upstream_views_file_path[view_name] = file_path + + return upstream_views_file_path + + +def explore_field_set_to_lkml_fields( + explore: LookmlModelExplore, +) -> List[LookmlModelExploreField]: + """ + explore.fields has three variables i.e. dimensions, measures, parameters of same type i.e. LookmlModelExploreField. 
+ This method creating a list by adding all field instance to lkml_fields + """ + lkml_fields: List[LookmlModelExploreField] = [] + + if explore.fields is None: + logger.debug(f"Explore({explore.name}) doesn't have any field") + return lkml_fields + + def empty_list( + fields: Optional[Sequence[LookmlModelExploreField]], + ) -> List[LookmlModelExploreField]: + return list(fields) if fields is not None else [] + + lkml_fields.extend(empty_list(explore.fields.dimensions)) + lkml_fields.extend(empty_list(explore.fields.measures)) + lkml_fields.extend(empty_list(explore.fields.parameters)) + + return lkml_fields + + class LookerUtil: field_type_mapping = { **POSTGRES_TYPES_MAP, @@ -457,6 +568,9 @@ class LookerExplore: upstream_views: Optional[ List[ProjectInclude] ] = None # captures the view name(s) this explore is derived from + upstream_views_file_path: Dict[str, Optional[str]] = dataclasses_field( + default_factory=dict + ) # view_name is key and file_path is value. A single file may contains multiple views joins: Optional[List[str]] = None fields: Optional[List[ViewField]] = None # the fields exposed in this explore source_file: Optional[str] = None @@ -558,6 +672,9 @@ def from_dict( description=dict.get("description"), upstream_views=upstream_views, joins=joins, + # This method is getting called from lookml_source's get_internal_workunits method + # & upstream_views_file_path is not in use in that code flow + upstream_views_file_path={}, ) @classmethod # noqa: C901 @@ -575,6 +692,10 @@ def from_api( # noqa: C901 explore = client.lookml_model_explore(model, explore_name) views: Set[str] = set() + lkml_fields: List[ + LookmlModelExploreField + ] = explore_field_set_to_lkml_fields(explore) + if explore.view_name is not None and explore.view_name != explore.name: # explore is not named after a view and is instead using a from field, which is modeled as view_name. 
aliased_explore = True @@ -685,6 +806,15 @@ def from_api( # noqa: C901 if view_project_map: logger.debug(f"views and their projects: {view_project_map}") + upstream_views_file_path: Dict[ + str, Optional[str] + ] = create_upstream_views_file_path_map( + lkml_fields=lkml_fields, + view_names=views, + ) + if upstream_views_file_path: + logger.debug(f"views and their file-paths: {upstream_views_file_path}") + return cls( name=explore_name, model_name=model, @@ -699,6 +829,7 @@ def from_api( # noqa: C901 ) for view_name in views ), + upstream_views_file_path=upstream_views_file_path, source_file=explore.source_file, ) except SDKError as e: @@ -791,12 +922,20 @@ def _to_metadata_events( # noqa: C901 upstreams = [] observed_lineage_ts = datetime.datetime.now(tz=datetime.timezone.utc) for view_ref in sorted(self.upstream_views): + # set file_path to ViewFieldType.UNKNOWN if file_path is not available to keep backward compatibility + # if we raise error on file_path equal to None then existing test-cases will fail as mock data doesn't have required attributes. 
+ file_path: str = ( + cast(str, self.upstream_views_file_path[view_ref.include]) + if self.upstream_views_file_path[view_ref.include] is not None + else ViewFieldValue.NOT_AVAILABLE.value + ) view_urn = LookerViewId( project_name=view_ref.project if view_ref.project != _BASE_PROJECT_NAME else self.project_name, model_name=self.model_name, view_name=view_ref.include, + file_path=file_path, ).get_urn(config) upstreams.append( diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py index 455614c758bb93..96c405f7257d04 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py @@ -83,10 +83,21 @@ class NamingPatternMapping: name: str +@dataclasses.dataclass +class ViewNamingPatternMapping(NamingPatternMapping): + file_path: str + + class LookerNamingPattern(NamingPattern): ALLOWED_VARS = [field.name for field in dataclasses.fields(NamingPatternMapping)] +class LookerViewNamingPattern(NamingPattern): + ALLOWED_VARS = [ + field.name for field in dataclasses.fields(ViewNamingPatternMapping) + ] + + class LookerCommonConfig(DatasetSourceConfigMixin): explore_naming_pattern: LookerNamingPattern = pydantic.Field( description=f"Pattern for providing dataset names to explores. {LookerNamingPattern.allowed_docstring()}", @@ -96,13 +107,13 @@ class LookerCommonConfig(DatasetSourceConfigMixin): description=f"Pattern for providing browse paths to explores. {LookerNamingPattern.allowed_docstring()}", default=LookerNamingPattern(pattern="/{env}/{platform}/{project}/explores"), ) - view_naming_pattern: LookerNamingPattern = Field( - LookerNamingPattern(pattern="{project}.view.{name}"), - description=f"Pattern for providing dataset names to views. 
{LookerNamingPattern.allowed_docstring()}", + view_naming_pattern: LookerViewNamingPattern = Field( + LookerViewNamingPattern(pattern="{project}.view.{name}"), + description=f"Pattern for providing dataset names to views. {LookerViewNamingPattern.allowed_docstring()}", ) - view_browse_pattern: LookerNamingPattern = Field( - LookerNamingPattern(pattern="/{env}/{platform}/{project}/views"), - description=f"Pattern for providing browse paths to views. {LookerNamingPattern.allowed_docstring()}", + view_browse_pattern: LookerViewNamingPattern = Field( + LookerViewNamingPattern(pattern="/{env}/{platform}/{project}/views"), + description=f"Pattern for providing browse paths to views. {LookerViewNamingPattern.allowed_docstring()}", ) tag_measures_and_dimensions: bool = Field( True, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index 1a32afa2b7fdd6..e69c3b6e601bd3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -58,6 +58,7 @@ ProjectInclude, ViewField, ViewFieldType, + ViewFieldValue, ) from datahub.ingestion.source.looker.looker_lib_wrapper import ( LookerAPI, @@ -1065,10 +1066,30 @@ def _get_fields( fields.append(field) return fields + @classmethod + def determine_view_file_path( + cls, base_folder_path: str, absolute_file_path: str + ) -> str: + + splits: List[str] = absolute_file_path.split(base_folder_path, 1) + if len(splits) != 2: + logger.debug( + f"base_folder_path({base_folder_path}) and absolute_file_path({absolute_file_path}) not matching" + ) + return ViewFieldValue.NOT_AVAILABLE.value + + file_path: str = splits[1] + logger.debug(f"file_path={file_path}") + + return file_path.strip( + "/" + ) # strip / from path to make it equivalent to source_file attribute of LookerModelExplore API + @classmethod def from_looker_dict( cls, 
project_name: str, + base_folder_path: str, model_name: str, looker_view: dict, connection: LookerConnectionDefinition, @@ -1083,6 +1104,7 @@ def from_looker_dict( populate_sql_logic_in_descriptions: bool = False, process_isolation_for_sql_parsing: bool = False, ) -> Optional["LookerView"]: + view_name = looker_view["name"] logger.debug(f"Handling view {view_name} in model {model_name}") # The sql_table_name might be defined in another view and this view is extending that view, @@ -1206,9 +1228,16 @@ def from_looker_dict( viewLanguage=VIEW_LANGUAGE_LOOKML, ) + file_path = LookerView.determine_view_file_path( + base_folder_path, looker_viewfile.absolute_file_path + ) + return LookerView( id=LookerViewId( - project_name=project_name, model_name=model_name, view_name=view_name + project_name=project_name, + model_name=model_name, + view_name=view_name, + file_path=file_path, ), absolute_file_path=looker_viewfile.absolute_file_path, connection=connection, @@ -1544,6 +1573,7 @@ def _construct_datalineage_urn( project_name=looker_view.id.project_name, model_name=looker_view.id.model_name, view_name=sql_table_name, + file_path=looker_view.id.file_path, ) return view_id.get_urn(self.source_config) @@ -2057,6 +2087,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 ) if looker_viewfile is not None: + for raw_view in looker_viewfile.views: raw_view_name = raw_view["name"] if LookerRefinementResolver.is_refinement(raw_view_name): @@ -2077,22 +2108,36 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 raw_view = looker_refinement_resolver.apply_view_refinement( raw_view=raw_view, ) - maybe_looker_view = LookerView.from_looker_dict( + + current_project_name: str = ( include.project if include.project != _BASE_PROJECT_NAME - else project_name, - model_name, - raw_view, - connectionDefinition, - looker_viewfile, - viewfile_loader, - looker_refinement_resolver, - self.reporter, - 
self.source_config.max_file_snippet_length, - self.source_config.parse_table_names_from_sql, - self.source_config.sql_parser, - self.source_config.extract_column_level_lineage, - self.source_config.populate_sql_logic_for_missing_descriptions, + else project_name + ) + + # if project is base project then it is available as self.base_projects_folder[_BASE_PROJECT_NAME] + base_folder_path: str = str( + self.base_projects_folder.get( + current_project_name, + self.base_projects_folder[_BASE_PROJECT_NAME], + ) + ) + + maybe_looker_view = LookerView.from_looker_dict( + project_name=current_project_name, + base_folder_path=base_folder_path, + model_name=model_name, + looker_view=raw_view, + connection=connectionDefinition, + looker_viewfile=looker_viewfile, + looker_viewfile_loader=viewfile_loader, + looker_refinement_resolver=looker_refinement_resolver, + reporter=self.reporter, + max_file_snippet_length=self.source_config.max_file_snippet_length, + parse_table_names_from_sql=self.source_config.parse_table_names_from_sql, + sql_parser_path=self.source_config.sql_parser, + extract_col_level_lineage=self.source_config.extract_column_level_lineage, + populate_sql_logic_in_descriptions=self.source_config.populate_sql_logic_for_missing_descriptions, process_isolation_for_sql_parsing=self.source_config.process_isolation_for_sql_parsing, ) except Exception as e: diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json new file mode 100644 index 00000000000000..b0f66e7b245c96 --- /dev/null +++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json @@ -0,0 +1,499 @@ +[ +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { + "urn": "urn:li:dashboard:(looker,dashboards.1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dashboard.DashboardInfo": { + "customProperties": {}, + "title": "foo", + 
"description": "lorem ipsum", + "charts": [], + "datasets": [], + "lastModified": { + "created": { + "time": 1586847600000, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1586847600000, + "actor": "urn:li:corpuser:unknown" + } + }, + "dashboardUrl": "https://looker.company.com/dashboards/1" + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(looker,dashboards.1)", + "changeType": "UPSERT", + "aspectName": "embed", + "aspect": { + "json": { + "renderUrl": "https://looker.company.com/embed/dashboards/1" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", + "changeType": "UPSERT", + "aspectName": "inputFields", + "aspect": { + "json": { + "fields": [ + { + "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(looker,dashboard_elements.2),calc)", + "schemaField": { + "fieldPath": "calc", + "nullable": false, + "description": "", + "label": "foobar", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false + } + }, + { + "schemaFieldUrn": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD),dim1)", + "schemaField": { + "fieldPath": "dim1", + "nullable": false, + "description": "dimension one description", + "label": "Dimensions One Label", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + } + } + ] + } + }, + 
"systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(looker,dashboards.1)", + "changeType": "UPSERT", + "aspectName": "inputFields", + "aspect": { + "json": { + "fields": [ + { + "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(looker,dashboard_elements.2),calc)", + "schemaField": { + "fieldPath": "calc", + "nullable": false, + "description": "", + "label": "foobar", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false + } + }, + { + "schemaFieldUrn": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD),dim1)", + "schemaField": { + "fieldPath": "dim1", + "nullable": false, + "description": "dimension one description", + "label": "Dimensions One Label", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/prod/looker/looker_hub/explores" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.explore.label": "My Explore View", + "looker.explore.file": "test_source_file.lkml" + }, + "externalUrl": "https://looker.company.com/explore/data/my_view", + "name": "My 
Explore View", + "description": "lorem ipsum", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,datahub-demo.views.datahub-demo.datasets.faa_flights.view.faa_flights,PROD)", + "type": "VIEW" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "my_view", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "dim1", + "nullable": false, + "description": "dimension one description", + "label": "Dimensions One Label", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Explore" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "changeType": "UPSERT", + "aspectName": "embed", + "aspect": { + "json": { + "renderUrl": "https://looker.company.com/embed/explore/data/my_view" 
+ } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "looker_hub" + }, + { + "id": "explores" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { + "urn": "urn:li:tag:Dimension", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.tag.TagProperties": { + "name": "Dimension", + "description": "A tag that is applied to all dimension fields." + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { + "urn": "urn:li:tag:Temporal", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.tag.TagProperties": { + "name": "Temporal", + "description": "A tag that is applied to all time-based (temporal) fields such as timestamps or durations." 
+ } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { + "urn": "urn:li:tag:Measure", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.tag.TagProperties": { + "name": "Measure", + "description": "A tag that is applied to all measures (metrics). Measures are typically the columns that you aggregate on" + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Dimension", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Measure", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Temporal", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": 
"looker-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index 9dc15fae3a23ba..ee6610cf75679c 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -473,9 +473,23 @@ def setup_mock_explore_unaliased_with_joins(mocked_client): ) -def setup_mock_explore(mocked_client): +def setup_mock_explore( + mocked_client: Any, additional_lkml_fields: List[LookmlModelExploreField] = [] +) -> None: mock_model = mock.MagicMock(project_name="lkml_samples") mocked_client.lookml_model.return_value = mock_model + + lkml_fields: List[LookmlModelExploreField] = [ + LookmlModelExploreField( + name="dim1", + type="string", + dimension_group=None, + description="dimension one description", + label_short="Dimensions One Label", + ) + ] + lkml_fields.extend(additional_lkml_fields) + mocked_client.lookml_model_explore.return_value = LookmlModelExplore( id="1", name="my_explore_name", @@ -484,15 +498,7 @@ def setup_mock_explore(mocked_client): view_name="underlying_view", project_name="lkml_samples", fields=LookmlModelExploreFieldset( - dimensions=[ - LookmlModelExploreField( - name="dim1", - type="string", - dimension_group=None, - description="dimension one description", - label_short="Dimensions One Label", - ) - ] + dimensions=lkml_fields, ), source_file="test_source_file.lkml", ) @@ -905,6 +911,55 @@ def test_independent_looks_ingest( ) +@freeze_time(FROZEN_TIME) +def test_file_path_in_view_naming_pattern( + pytestconfig, tmp_path, mock_time, mock_datahub_graph +): + mocked_client = mock.MagicMock() + new_recipe = get_default_recipe(output_file_path=f"{tmp_path}/looker_mces.json") + new_recipe["source"]["config"][ + "view_naming_pattern" + ] = "{project}.{file_path}.view.{name}" + + with mock.patch( + 
"datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", + mock_datahub_graph, + ) as mock_checkpoint, mock.patch("looker_sdk.init40") as mock_sdk: + mock_checkpoint.return_value = mock_datahub_graph + + mock_sdk.return_value = mocked_client + setup_mock_dashboard(mocked_client) + setup_mock_explore( + mocked_client, + additional_lkml_fields=[ + LookmlModelExploreField( + name="dim2", + type="string", + dimension_group=None, + description="dimension one description", + label_short="Dimensions One Label", + view="underlying_view", + source_file="views/underlying_view.view.lkml", + ) + ], + ) + setup_mock_look(mocked_client) + setup_mock_external_project_view_explore(mocked_client) + + test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" + + pipeline = Pipeline.create(new_recipe) + pipeline.run() + pipeline.raise_from_status() + mce_out_file = "golden_test_file_path_ingest.json" + + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / "looker_mces.json", + golden_path=f"{test_resources_dir}/{mce_out_file}", + ) + + @freeze_time(FROZEN_TIME) def test_independent_soft_deleted_looks( pytestconfig, diff --git a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json index dc5e1aa9096f84..ce4123fb7e93d6 100644 --- a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json +++ b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json @@ -2,12 +2,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/prod/looker/lkml_samples/views" + 
"/prod/looker/lkml_samples/foo.view.lkml/views" ] } }, @@ -176,12 +176,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -193,12 +194,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -210,12 +212,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -227,6 +230,9 @@ { "id": "lkml_samples" }, + { + "id": "foo.view.lkml" + }, { "id": "views" } @@ -235,18 +241,19 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.bar.view.my_derived_view,PROD)", "aspects": [ { 
"com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/prod/looker/lkml_samples/views" + "/prod/looker/lkml_samples/bar.view.lkml/views" ] } }, @@ -263,7 +270,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.bar.view.my_view,PROD)", "type": "VIEW" } ] @@ -394,12 +401,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.bar.view.my_derived_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -411,12 +419,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.bar.view.my_derived_view,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -428,12 +437,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.bar.view.my_derived_view,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -445,6 +455,9 @@ { "id": "lkml_samples" }, + { + "id": "bar.view.lkml" + }, { "id": "views" } @@ -453,18 +466,19 @@ }, "systemMetadata": { "lastObserved": 
1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.included_view_file.view.include_able_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/prod/looker/lkml_samples/views" + "/prod/looker/lkml_samples/included_view_file.view.lkml/views" ] } }, @@ -501,12 +515,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.included_view_file.view.include_able_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -518,12 +533,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.included_view_file.view.include_able_view,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -535,12 +551,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.included_view_file.view.include_able_view,PROD)", "changeType": 
"UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -552,6 +569,9 @@ { "id": "lkml_samples" }, + { + "id": "included_view_file.view.lkml" + }, { "id": "views" } @@ -560,18 +580,19 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.looker_events,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/prod/looker/lkml_samples/views" + "/prod/looker/lkml_samples/view_declarations.view.lkml/views" ] } }, @@ -608,12 +629,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.looker_events,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -625,12 +647,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.looker_events,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -642,12 +665,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.looker_events,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -659,6 +683,9 @@ { "id": "lkml_samples" }, + { + "id": "view_declarations.view.lkml" + }, { "id": "views" } @@ -667,18 +694,19 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.extending_looker_events,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/prod/looker/lkml_samples/views" + "/prod/looker/lkml_samples/view_declarations.view.lkml/views" ] } }, @@ -760,12 +788,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.extending_looker_events,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -777,12 +806,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.extending_looker_events,PROD)", "changeType": "UPSERT", 
"aspectName": "viewProperties", "aspect": { @@ -794,12 +824,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.extending_looker_events,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -811,6 +842,9 @@ { "id": "lkml_samples" }, + { + "id": "view_declarations.view.lkml" + }, { "id": "views" } @@ -819,18 +853,19 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.autodetect_sql_name_based_on_view_name,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/prod/looker/lkml_samples/views" + "/prod/looker/lkml_samples/view_declarations.view.lkml/views" ] } }, @@ -867,12 +902,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.autodetect_sql_name_based_on_view_name,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -884,12 +920,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + 
"lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.autodetect_sql_name_based_on_view_name,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -901,12 +938,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.autodetect_sql_name_based_on_view_name,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -918,6 +956,9 @@ { "id": "lkml_samples" }, + { + "id": "view_declarations.view.lkml" + }, { "id": "views" } @@ -926,18 +967,19 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.test_include_external_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/prod/looker/lkml_samples/views" + "/prod/looker/lkml_samples/view_declarations.view.lkml/views" ] } }, @@ -974,12 +1016,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.test_include_external_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -991,12 +1034,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.test_include_external_view,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -1008,12 +1052,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view_declarations.view.test_include_external_view,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1025,6 +1070,9 @@ { "id": "lkml_samples" }, + { + "id": "view_declarations.view.lkml" + }, { "id": "views" } @@ -1033,18 +1081,19 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.nested.fragment_derived.view.fragment_derived_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - 
"/prod/looker/lkml_samples/views" + "/prod/looker/lkml_samples/nested/fragment_derived.view.lkml/views" ] } }, @@ -1156,12 +1205,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.nested.fragment_derived.view.fragment_derived_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1173,12 +1223,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.nested.fragment_derived.view.fragment_derived_view,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -1190,12 +1241,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.nested.fragment_derived.view.fragment_derived_view,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1207,6 +1259,12 @@ { "id": "lkml_samples" }, + { + "id": "nested" + }, + { + "id": "fragment_derived.view.lkml" + }, { "id": "views" } @@ -1215,18 +1273,19 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { 
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.liquid.view.customer_facts,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/prod/looker/lkml_samples/views" + "/prod/looker/lkml_samples/liquid.view.lkml/views" ] } }, @@ -1263,12 +1322,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.liquid.view.customer_facts,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1280,12 +1340,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.liquid.view.customer_facts,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -1297,12 +1358,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.liquid.view.customer_facts,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1314,6 +1376,9 @@ { "id": "lkml_samples" }, + { + "id": "liquid.view.lkml" + }, { "id": "views" } @@ -1322,18 +1387,19 @@ }, "systemMetadata": { 
"lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.ability.view.ability,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/prod/looker/lkml_samples/views" + "/prod/looker/lkml_samples/ability.view.lkml/views" ] } }, @@ -1362,7 +1428,7 @@ ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD),pk)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.ability.view.ability,PROD),pk)" ], "confidenceScore": 1.0 } @@ -1449,12 +1515,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.ability.view.ability,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1466,12 +1533,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.ability.view.ability,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -1483,12 +1551,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - 
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.ability.view.ability,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1500,6 +1569,9 @@ { "id": "lkml_samples" }, + { + "id": "ability.view.lkml" + }, { "id": "views" } @@ -1508,18 +1580,19 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.owners.view.owners,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/prod/looker/lkml_samples/views" + "/prod/looker/lkml_samples/owners.view.lkml/views" ] } }, @@ -1548,7 +1621,7 @@ ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD),id)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.owners.view.owners,PROD),id)" ], "confidenceScore": 1.0 }, @@ -1559,7 +1632,7 @@ ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD),owner_name)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.owners.view.owners,PROD),owner_name)" ], "confidenceScore": 1.0 }, @@ -1570,7 +1643,7 @@ ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD),has_owner_name)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.owners.view.owners,PROD),has_owner_name)" ], "confidenceScore": 1.0 } @@ -1680,12 +1753,13 @@ }, "systemMetadata": { 
"lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.owners.view.owners,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1697,12 +1771,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.owners.view.owners,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -1714,12 +1789,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.owners.view.owners,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1731,6 +1807,9 @@ { "id": "lkml_samples" }, + { + "id": "owners.view.lkml" + }, { "id": "views" } @@ -1739,18 +1818,19 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.native_derived_table.view.view_derived_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - 
"/prod/looker/lkml_samples/views" + "/prod/looker/lkml_samples/native_derived_table.view.lkml/views" ] } }, @@ -1779,7 +1859,7 @@ ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD),country)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.native_derived_table.view.view_derived_explore,PROD),country)" ], "confidenceScore": 1.0 }, @@ -1790,7 +1870,7 @@ ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD),city)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.native_derived_table.view.view_derived_explore,PROD),city)" ], "confidenceScore": 1.0 } @@ -1919,12 +1999,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.native_derived_table.view.view_derived_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1936,12 +2017,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.native_derived_table.view.view_derived_explore,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -1953,12 +2035,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": 
"dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.native_derived_table.view.view_derived_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1970,6 +2053,9 @@ { "id": "lkml_samples" }, + { + "id": "native_derived_table.view.lkml" + }, { "id": "views" } @@ -1978,18 +2064,19 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.flights.view.flights,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/prod/looker/lkml_samples/views" + "/prod/looker/lkml_samples/flights.view.lkml/views" ] } }, @@ -2018,7 +2105,7 @@ ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD),id)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.flights.view.flights,PROD),id)" ], "confidenceScore": 1.0 } @@ -2086,12 +2173,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.flights.view.flights,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2103,12 +2191,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": 
"dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.flights.view.flights,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -2120,12 +2209,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.flights.view.flights,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -2137,6 +2227,9 @@ { "id": "lkml_samples" }, + { + "id": "flights.view.lkml" + }, { "id": "views" } @@ -2145,7 +2238,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2160,7 +2254,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2175,7 +2270,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2190,7 +2286,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "lookml-test" + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index 85eb4dcd92ec9b..21a0b19849d975 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -99,6 +99,15 @@ def test_lookml_refinement_ingest(pytestconfig, tmp_path, mock_time): f"{tmp_path}/{mce_out_file}", 
f"{test_resources_dir}/lkml_samples" ) new_recipe["source"]["config"]["process_refinements"] = True + + new_recipe["source"]["config"][ + "view_naming_pattern" + ] = "{project}.{file_path}.view.{name}" + + new_recipe["source"]["config"][ + "view_browse_pattern" + ] = "/{env}/{platform}/{project}/{file_path}/views" + pipeline = Pipeline.create(new_recipe) pipeline.run() pipeline.pretty_print_summary() From 0f3819b511e7f81c85e7df474eda8d670ad19027 Mon Sep 17 00:00:00 2001 From: ukayani Date: Mon, 11 Sep 2023 11:13:55 -0400 Subject: [PATCH 02/65] feat(upgrade): add ability to provide a startingOffset for RestoreIndices (#8539) --- .../datahub/upgrade/restoreindices/RestoreIndices.java | 2 ++ .../datahub/upgrade/restoreindices/SendMAEStep.java | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java index ee6a5ed6f1536f..3c0a9762a28c92 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java @@ -25,6 +25,8 @@ public class RestoreIndices implements Upgrade { public static final String URN_ARG_NAME = "urn"; public static final String URN_LIKE_ARG_NAME = "urnLike"; + public static final String STARTING_OFFSET_ARG_NAME = "startingOffset"; + private final List _steps; public RestoreIndices(final Database server, final EntityService entityService, diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java index ce39b3fb562aff..2ac4fea2e653ac 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java +++ 
b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java @@ -30,6 +30,8 @@ public class SendMAEStep implements UpgradeStep { private static final int DEFAULT_BATCH_SIZE = 1000; private static final long DEFAULT_BATCH_DELAY_MS = 250; + + private static final int DEFAULT_STARTING_OFFSET = 0; private static final int DEFAULT_THREADS = 1; private final Database _server; @@ -83,6 +85,7 @@ private RestoreIndicesArgs getArgs(UpgradeContext context) { result.batchSize = getBatchSize(context.parsedArgs()); result.numThreads = getThreadCount(context.parsedArgs()); result.batchDelayMs = getBatchDelayMs(context.parsedArgs()); + result.start = getStartingOffset(context.parsedArgs()); if (containsKey(context.parsedArgs(), RestoreIndices.ASPECT_NAME_ARG_NAME)) { result.aspectName = context.parsedArgs().get(RestoreIndices.ASPECT_NAME_ARG_NAME).get(); } @@ -124,7 +127,7 @@ public Function executable() { final int rowCount = getRowCount(args); context.report().addLine(String.format("Found %s latest aspects in aspects table in %.2f minutes.", rowCount, (float) (System.currentTimeMillis() - startTime) / 1000 / 60)); - int start = 0; + int start = args.start; List> futures = new ArrayList<>(); startTime = System.currentTimeMillis(); @@ -186,6 +189,10 @@ private int getBatchSize(final Map> parsedArgs) { return getInt(parsedArgs, DEFAULT_BATCH_SIZE, RestoreIndices.BATCH_SIZE_ARG_NAME); } + private int getStartingOffset(final Map> parsedArgs) { + return getInt(parsedArgs, DEFAULT_STARTING_OFFSET, RestoreIndices.STARTING_OFFSET_ARG_NAME); + } + private long getBatchDelayMs(final Map> parsedArgs) { long resolvedBatchDelayMs = DEFAULT_BATCH_DELAY_MS; if (containsKey(parsedArgs, RestoreIndices.BATCH_DELAY_MS_ARG_NAME)) { From 1efc5d984662128630a7605dcbcba8ddde3f3d02 Mon Sep 17 00:00:00 2001 From: Indy Prentice Date: Mon, 11 Sep 2023 13:31:44 -0300 Subject: [PATCH 03/65] fix(index): Do not override the search analyzer for ngram fields (#8818) Co-authored-by: 
Indy Prentice --- .../search/elasticsearch/indexbuilder/MappingsBuilder.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java index 4bbff3915aca93..b3e05d966e36b7 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java @@ -121,8 +121,7 @@ private static Map getMappingsForField(@Nonnull final Searchable String analyzerName = entry.getValue(); subFields.put(fieldName, ImmutableMap.of( TYPE, TEXT, - ANALYZER, analyzerName, - SEARCH_ANALYZER, analyzerName + ANALYZER, analyzerName )); } } From 486bd861d23a5657e727c24e5d1fae0846a81d33 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Mon, 11 Sep 2023 14:20:21 -0500 Subject: [PATCH 04/65] =?UTF-8?q?test(managed=5Fingestion):=20fix=20manage?= =?UTF-8?q?d=20ingestion=20test=20by=20fixing=20actions=E2=80=A6=20(#8820)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/docker-unified.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 31fead8a7ade65..13c921e953c324 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -809,7 +809,7 @@ jobs: DATAHUB_VERSION: ${{ needs.setup.outputs.unique_tag }} DATAHUB_ACTIONS_IMAGE: ${{ env.DATAHUB_INGESTION_IMAGE }} ACTIONS_VERSION: ${{ needs.datahub_ingestion_slim_build.outputs.tag }} - ACTIONS_EXTRA_PACKAGES: 'acryl-datahub-actions[executor] acryl-datahub-actions' + ACTIONS_EXTRA_PACKAGES: 'acryl-datahub-actions[executor]==0.0.13 
acryl-datahub-actions==0.0.13 acryl-datahub==0.10.5' ACTIONS_CONFIG: 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' run: | ./smoke-test/run-quickstart.sh From 5a0ce38bc6aac8fc12cc7f6ee50388b84a2b9c4e Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 11 Sep 2023 13:29:31 -0700 Subject: [PATCH 05/65] docs: add 0.11 docs to docs site (#8813) Co-authored-by: Indy Prentice --- .github/workflows/documentation.yml | 1 + docs-website/build.gradle | 4 ++++ docs-website/versions.json | 1 + 3 files changed, 6 insertions(+) diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 1cbc65f2b63700..68432a4feb13dd 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -27,6 +27,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: "3.10" + cache: pip - name: Install Python dependencies run: ./metadata-ingestion/scripts/install_deps.sh - name: Build Docs diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 851c10d9ea97f1..370ae3eec91761 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -119,6 +119,10 @@ task yarnBuild(type: YarnTask, dependsOn: [yarnLint, yarnGenerate, downloadHisto outputs.dir("dist") // tell gradle to apply the build cache outputs.cacheIf { true } + // See https://stackoverflow.com/questions/53230823/fatal-error-ineffective-mark-compacts-near-heap-limit-allocation-failed-java + // and https://github.com/facebook/docusaurus/issues/8329. + // TODO: As suggested in https://github.com/facebook/docusaurus/issues/4765, try switching to swc-loader. 
+ environment = ['NODE_OPTIONS': '--max-old-space-size=10248'] args = ['run', 'build'] } diff --git a/docs-website/versions.json b/docs-website/versions.json index 0b79ac9498e063..a5493c26a4c659 100644 --- a/docs-website/versions.json +++ b/docs-website/versions.json @@ -1,3 +1,4 @@ [ + "0.11.0", "0.10.5" ] From aee1e68f5db3b48e8539796ebc64b14756590e8f Mon Sep 17 00:00:00 2001 From: Indy Prentice Date: Mon, 11 Sep 2023 17:33:03 -0300 Subject: [PATCH 06/65] docs(release): Update updating-datahub.md for 0.11.0 release (#8821) Co-authored-by: Indy Prentice --- docs/how/updating-datahub.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 7ba516c82cf1b7..1ef7413a88ebde 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -8,16 +8,38 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ### Potential Downtime +### Deprecations + +### Other Notable Changes + +## 0.11.0 + +### Breaking Changes + +### Potential Downtime +- #8611 Search improvements requires reindexing indices. A `system-update` job will run which will set indices to read-only and create a backup/clone of each index. During the reindexing new components will be prevented from start-up until the reindex completes. The logs of this job will indicate a % complete per index. Depending on index sizes and infrastructure this process can take 5 minutes to hours however as a rough estimate 1 hour for every 2.3 million entities. + ### Deprecations - #8525: In LDAP ingestor, the `manager_pagination_enabled` changed to general `pagination_enabled` +- MAE Events are no longer produced. MAE events have been deprecated for over a year. ### Other Notable Changes +- In this release we now enable you to create and delete pinned announcements on your DataHub homepage! 
If you have the “Manage Home Page Posts” platform privilege you’ll see a new section in settings called “Home Page Posts” where you can create and delete text posts and link posts that your users see on the home page. +- The new search and browse experience, which was first made available in the previous release behind a feature flag, is now on by default. Check out our release notes for v0.10.5 to get more information and documentation on this new Browse experience. +- In addition to the ranking changes mentioned above, this release includes changes to the highlighting of search entities to understand why they match your query. You can also sort your results alphabetically or by last updated times, in addition to relevance. In this release, we suggest a correction if your query has a typo in it. - #8300: Clickhouse source now inherited from TwoTierSQLAlchemy. In old way we have platform_instance -> container -> co container db (None) -> container schema and now we have platform_instance -> container database. - #8300: Added `uri_opts` argument; now we can add any options for clickhouse client. - #8659: BigQuery ingestion no longer creates DataPlatformInstance aspects by default. This will only affect users that were depending on this aspect for custom functionality, and can be enabled via the `include_data_platform_instance` config option. +- OpenAPI entity and aspect endpoints expanded to improve developer experience when using this API with additional aspects to be added in the near future. +- The CLI now supports recursive deletes. +- Batching of default aspects on initial ingestion (SQL) +- Improvements to multi-threading. Ingestion recipes, if previously reduced to 1 thread, can be restored to the 15 thread default. 
+- Gradle 7 upgrade moderately improves build speed +- DataHub Ingestion slim images reduced in size by 2GB+ +- Glue Schema Registry fixed ## 0.10.5 From a021053a72b79b6d028977649b05adf85fdb81f4 Mon Sep 17 00:00:00 2001 From: cjm98332 <140763930+cjm98332@users.noreply.github.com> Date: Tue, 12 Sep 2023 06:53:39 -0700 Subject: [PATCH 07/65] fix(ingest/mssql): Add UNIQUEIDENTIFIER data type as String (#8642) Co-authored-by: Andrew Sikowitz --- .../ingestion/source/sql/mssql/source.py | 7 ++++- .../golden_mces_mssql_no_db_to_file.json | 12 +++++++++ .../golden_mces_mssql_no_db_with_filter.json | 12 +++++++++ .../golden_mces_mssql_to_file.json | 12 +++++++++ ...golden_mces_mssql_with_lower_case_urn.json | 12 +++++++++ .../integration/sql_server/setup/setup.sql | 27 ++++++++++--------- 6 files changed, 68 insertions(+), 14 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index 3c7701d93edebc..685d4fb3074c92 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -46,12 +46,17 @@ BasicSQLAlchemyConfig, make_sqlalchemy_uri, ) -from datahub.metadata.schema_classes import BooleanTypeClass, UnionTypeClass +from datahub.metadata.schema_classes import ( + BooleanTypeClass, + StringTypeClass, + UnionTypeClass, +) logger: logging.Logger = logging.getLogger(__name__) register_custom_type(sqlalchemy.dialects.mssql.BIT, BooleanTypeClass) register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, UnionTypeClass) +register_custom_type(sqlalchemy.dialects.mssql.UNIQUEIDENTIFIER, StringTypeClass) class SQLServerConfig(BasicSQLAlchemyConfig): diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index 
67a563baa561cd..a495d04c4e398c 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -1752,6 +1752,18 @@ "recursive": false, "isPartOfKey": true }, + { + "fieldPath": "SomeId", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "UNIQUEIDENTIFIER()", + "recursive": false, + "isPartOfKey": false + }, { "fieldPath": "Name", "nullable": true, diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index ef6033dd919435..8277ff8bf7e89a 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -1752,6 +1752,18 @@ "recursive": false, "isPartOfKey": true }, + { + "fieldPath": "SomeId", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "UNIQUEIDENTIFIER()", + "recursive": false, + "isPartOfKey": false + }, { "fieldPath": "Name", "nullable": true, diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index 8098accebb424c..f3714bba6364d0 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -1752,6 +1752,18 @@ "recursive": false, "isPartOfKey": true }, + { + "fieldPath": "SomeId", + "nullable": false, + "type": { + "type": { + 
"com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "UNIQUEIDENTIFIER()", + "recursive": false, + "isPartOfKey": false + }, { "fieldPath": "Name", "nullable": true, diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index d32002fb5648cc..d25d23daae2eac 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -1752,6 +1752,18 @@ "recursive": false, "isPartOfKey": true }, + { + "fieldPath": "SomeId", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "UNIQUEIDENTIFIER()", + "recursive": false, + "isPartOfKey": false + }, { "fieldPath": "Name", "nullable": true, diff --git a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql index 2ff46e249007a6..c1347a7c8cacaf 100644 --- a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql +++ b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql @@ -34,7 +34,8 @@ CREATE TABLE Foo.Persons ( GO CREATE TABLE Foo.SalesReason ( - TempID int NOT NULL, + TempID int NOT NULL, + SomeId UNIQUEIDENTIFIER NOT NULL DEFAULT NEWID(), Name nvarchar(50) , CONSTRAINT PK_TempSales PRIMARY KEY NONCLUSTERED (TempID) , CONSTRAINT FK_TempSales_SalesReason FOREIGN KEY (TempID) @@ -49,20 +50,20 @@ AS SELECT @ID AS ThatDB; GO -GO -EXEC sys.sp_addextendedproperty -@name = N'MS_Description', -@value = N'Description for table Items of schema Foo.', -@level0type = N'SCHEMA', @level0name = 'Foo', -@level1type = N'TABLE', @level1name = 'Items'; +GO +EXEC sys.sp_addextendedproperty +@name = N'MS_Description', +@value = 
N'Description for table Items of schema Foo.', +@level0type = N'SCHEMA', @level0name = 'Foo', +@level1type = N'TABLE', @level1name = 'Items'; GO -GO -EXEC sys.sp_addextendedproperty -@name = N'MS_Description', -@value = N'Description for column LastName of table Persons of schema Foo.', -@level0type = N'SCHEMA', @level0name = 'Foo', -@level1type = N'TABLE', @level1name = 'Persons', +GO +EXEC sys.sp_addextendedproperty +@name = N'MS_Description', +@value = N'Description for column LastName of table Persons of schema Foo.', +@level0type = N'SCHEMA', @level0name = 'Foo', +@level1type = N'TABLE', @level1name = 'Persons', @level2type = N'COLUMN',@level2name = 'LastName'; GO USE msdb ; From 303a2d0863d7a86e96dd3c32655f1a044dc6bffe Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Wed, 13 Sep 2023 00:00:24 +0530 Subject: [PATCH 08/65] build(ingest): upgrade to sqlalchemy 1.4, drop 1.3 support (#8810) Co-authored-by: Harshal Sheth --- docs/how/updating-datahub.md | 1 + metadata-ingestion/build.gradle | 3 -- .../scripts/install-sqlalchemy-stubs.sh | 28 --------------- metadata-ingestion/setup.py | 25 ++++++------- .../source/datahub/datahub_database_reader.py | 6 +--- .../source/snowflake/snowflake_usage_v2.py | 9 +---- .../ingestion/source/sql/clickhouse.py | 35 +------------------ .../source/usage/clickhouse_usage.py | 6 +--- .../ingestion/source/usage/redshift_usage.py | 4 +-- .../source/usage/starburst_trino_usage.py | 6 +--- 10 files changed, 17 insertions(+), 106 deletions(-) delete mode 100755 metadata-ingestion/scripts/install-sqlalchemy-stubs.sh diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 1ef7413a88ebde..9b19291ee246ae 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -5,6 +5,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ## Next ### Breaking Changes +- #8810 - Removed support for SQLAlchemy 1.3.x. 
Only SQLAlchemy 1.4.x is supported now. ### Potential Downtime diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index 199ccc59c21e04..408ea771bc93f6 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -71,7 +71,6 @@ task installDev(type: Exec, dependsOn: [install]) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + "${venv_name}/bin/pip install -e .[dev] ${extra_pip_requirements} && " + - "./scripts/install-sqlalchemy-stubs.sh && " + "touch ${sentinel_file}" } @@ -82,7 +81,6 @@ task installAll(type: Exec, dependsOn: [install]) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + "${venv_name}/bin/pip install -e .[all] ${extra_pip_requirements} && " + - "./scripts/install-sqlalchemy-stubs.sh && " + "touch ${sentinel_file}" } @@ -119,7 +117,6 @@ task lint(type: Exec, dependsOn: installDev) { task lintFix(type: Exec, dependsOn: installDev) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + - "./scripts/install-sqlalchemy-stubs.sh && " + "black src/ tests/ examples/ && " + "isort src/ tests/ examples/ && " + "flake8 src/ tests/ examples/ && " + diff --git a/metadata-ingestion/scripts/install-sqlalchemy-stubs.sh b/metadata-ingestion/scripts/install-sqlalchemy-stubs.sh deleted file mode 100755 index 7c14a06464f99e..00000000000000 --- a/metadata-ingestion/scripts/install-sqlalchemy-stubs.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -# ASSUMPTION: This assumes that we're running from inside the venv. 
- -SQLALCHEMY_VERSION=$(python -c 'import sqlalchemy; print(sqlalchemy.__version__)') - -if [[ $SQLALCHEMY_VERSION == 1.3.* ]]; then - ENSURE_NOT_INSTALLED=sqlalchemy2-stubs - ENSURE_INSTALLED=sqlalchemy-stubs -elif [[ $SQLALCHEMY_VERSION == 1.4.* ]]; then - ENSURE_NOT_INSTALLED=sqlalchemy-stubs - ENSURE_INSTALLED=sqlalchemy2-stubs -else - echo "Unsupported SQLAlchemy version: $SQLALCHEMY_VERSION" - exit 1 -fi - -FORCE_REINSTALL="" -if pip show $ENSURE_NOT_INSTALLED >/dev/null 2>&1 ; then - pip uninstall --yes $ENSURE_NOT_INSTALLED - FORCE_REINSTALL="--force-reinstall" -fi - -if [ -n "$FORCE_REINSTALL" ] || ! pip show $ENSURE_INSTALLED >/dev/null 2>&1 ; then - pip install $FORCE_REINSTALL $ENSURE_INSTALLED -fi diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index d8668e89255468..09f71fa769fd37 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -112,7 +112,8 @@ def get_long_description(): sql_common = { # Required for all SQL sources. - "sqlalchemy>=1.3.24, <2", + # This is temporary lower bound that we're open to loosening/tightening as requirements show up + "sqlalchemy>=1.4.39, <2", # Required for SQL profiling. "great-expectations>=0.15.12, <=0.15.50", # scipy version restricted to reduce backtracking, used by great-expectations, @@ -172,13 +173,13 @@ def get_long_description(): } clickhouse_common = { - # Clickhouse 0.1.8 requires SQLAlchemy 1.3.x, while the newer versions - # allow SQLAlchemy 1.4.x. - "clickhouse-sqlalchemy>=0.1.8", + # Clickhouse 0.2.0 adds support for SQLAlchemy 1.4.x + "clickhouse-sqlalchemy>=0.2.0", } redshift_common = { - "sqlalchemy-redshift", + # Clickhouse 0.8.3 adds support for SQLAlchemy 1.4.x + "sqlalchemy-redshift>=0.8.3", "psycopg2-binary", "GeoAlchemy2", *sqllineage_lib, @@ -188,13 +189,8 @@ def get_long_description(): snowflake_common = { # Snowflake plugin utilizes sql common *sql_common, - # Required for all Snowflake sources. 
- # See https://github.com/snowflakedb/snowflake-sqlalchemy/issues/234 for why 1.2.5 is blocked. - "snowflake-sqlalchemy>=1.2.4, !=1.2.5", - # Because of https://github.com/snowflakedb/snowflake-sqlalchemy/issues/350 we need to restrict SQLAlchemy's max version. - # Eventually we should just require snowflake-sqlalchemy>=1.4.3, but I won't do that immediately - # because it may break Airflow users that need SQLAlchemy 1.3.x. - "SQLAlchemy<1.4.42", + # https://github.com/snowflakedb/snowflake-sqlalchemy/issues/350 + "snowflake-sqlalchemy>=1.4.3", # See https://github.com/snowflakedb/snowflake-connector-python/pull/1348 for why 2.8.2 is blocked "snowflake-connector-python!=2.8.2", "pandas", @@ -206,9 +202,7 @@ def get_long_description(): } trino = { - # Trino 0.317 broke compatibility with SQLAlchemy 1.3.24. - # See https://github.com/trinodb/trino-python-client/issues/250. - "trino[sqlalchemy]>=0.308, !=0.317", + "trino[sqlalchemy]>=0.308", } pyhive_common = { @@ -430,6 +424,7 @@ def get_long_description(): "types-Deprecated", "types-protobuf>=4.21.0.1", "types-tzlocal", + "sqlalchemy2-stubs", } diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py index a5aadbd6e246b4..96184d8d445e4e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py @@ -69,11 +69,7 @@ def get_aspects( return for i, row in enumerate(rows): - # TODO: Replace with namedtuple usage once we drop sqlalchemy 1.3 - if hasattr(row, "_asdict"): - row_dict = row._asdict() - else: - row_dict = dict(row) + row_dict = row._asdict() mcp = self._parse_row(row_dict) if mcp: yield mcp, row_dict["createdon"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py 
b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index d041d219c4bdd7..1cbd4a3b3ea244 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -451,17 +451,10 @@ def _get_operation_aspect_work_unit( yield wu def _process_snowflake_history_row( - self, row: Any + self, event_dict: dict ) -> Iterable[SnowflakeJoinedAccessEvent]: try: # big hammer try block to ensure we don't fail on parsing events self.report.rows_processed += 1 - # Make some minor type conversions. - if hasattr(row, "_asdict"): - # Compat with SQLAlchemy 1.3 and 1.4 - # See https://docs.sqlalchemy.org/en/14/changelog/migration_14.html#rowproxy-is-no-longer-a-proxy-is-now-called-row-and-behaves-like-an-enhanced-named-tuple. - event_dict = row._asdict() - else: - event_dict = dict(row) # no use processing events that don't have a query text if not event_dict["QUERY_TEXT"]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py index 20130ef21e5e6b..1626f86b92545c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py @@ -38,7 +38,6 @@ logger, register_custom_type, ) -from datahub.ingestion.source.sql.sql_config import make_sqlalchemy_uri from datahub.ingestion.source.sql.two_tier_sql_source import ( TwoTierSQLAlchemyConfig, TwoTierSQLAlchemySource, @@ -147,7 +146,6 @@ class ClickHouseConfig( include_materialized_views: Optional[bool] = Field(default=True, description="") def get_sql_alchemy_url(self, current_db=None): - url = make_url( super().get_sql_alchemy_url(uri_opts=self.uri_opts, current_db=current_db) ) @@ -158,42 +156,11 @@ def get_sql_alchemy_url(self, current_db=None): ) # We can setup clickhouse ingestion in sqlalchemy_uri form and config form. 
- - # If we use sqlalchemu_uri form then super().get_sql_alchemy_url doesn't - # update current_db because it return self.sqlalchemy_uri without any update. - # This code bellow needed for rewriting sqlalchemi_uri and replace database with current_db.from - # For the future without python3.7 and sqlalchemy 1.3 support we can use code - # url=url.set(db=current_db), but not now. - # Why we need to update database in uri at all? # Because we get database from sqlalchemy inspector and inspector we form from url inherited from # TwoTierSQLAlchemySource and SQLAlchemySource - if self.sqlalchemy_uri and current_db: - self.scheme = url.drivername - self.username = url.username - self.password = ( - pydantic.SecretStr(str(url.password)) - if url.password - else pydantic.SecretStr("") - ) - if url.host and url.port: - self.host_port = url.host + ":" + str(url.port) - elif url.host: - self.host_port = url.host - # untill released https://github.com/python/mypy/pull/15174 - self.uri_opts = {str(k): str(v) for (k, v) in url.query.items()} - - url = make_url( - make_sqlalchemy_uri( - self.scheme, - self.username, - self.password.get_secret_value() if self.password else None, - self.host_port, - current_db if current_db else self.database, - uri_opts=self.uri_opts, - ) - ) + url = url.set(database=current_db) return str(url) diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py index 855958f0755e1e..f659ea0c1c5c0e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py @@ -143,11 +143,7 @@ def _get_clickhouse_history(self): results = engine.execute(query) events = [] for row in results: - # minor type conversion - if hasattr(row, "_asdict"): - event_dict = row._asdict() - else: - event_dict = dict(row) + event_dict = row._asdict() # stripping extra spaces caused 
by above _asdict() conversion for k, v in event_dict.items(): diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py index 99a980b326e531..691eaa8211054c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py @@ -298,9 +298,7 @@ def _gen_access_events_from_history_query( for row in results: if not self._should_process_row(row): continue - if hasattr(row, "_asdict"): - # Compatibility with sqlalchemy 1.4.x. - row = row._asdict() + row = row._asdict() access_event = RedshiftAccessEvent(**dict(row.items())) # Replace database name with the alias name if one is provided in the config. if self.config.database_alias: diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py index 9394a8bba5e0b6..c38800b3a69838 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py @@ -162,11 +162,7 @@ def _get_trino_history(self): results = engine.execute(query) events = [] for row in results: - # minor type conversion - if hasattr(row, "_asdict"): - event_dict = row._asdict() - else: - event_dict = dict(row) + event_dict = row._asdict() # stripping extra spaces caused by above _asdict() conversion for k, v in event_dict.items(): From f7fee743bfddf27f072e5c56512ef905d942eab6 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 12 Sep 2023 13:11:01 -0700 Subject: [PATCH 09/65] fix(ingest): use epoch 1 for dev build versions (#8824) --- docker/datahub-ingestion-base/smoke.Dockerfile | 2 +- docker/datahub-ingestion/Dockerfile | 4 ++-- docker/datahub-ingestion/Dockerfile-slim-only | 2 +- metadata-ingestion-modules/airflow-plugin/scripts/release.sh 
| 2 +- .../airflow-plugin/src/datahub_airflow_plugin/__init__.py | 2 +- metadata-ingestion/scripts/release.sh | 2 +- metadata-ingestion/src/datahub/__init__.py | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docker/datahub-ingestion-base/smoke.Dockerfile b/docker/datahub-ingestion-base/smoke.Dockerfile index 276f6dbc4436e2..15dc46ae5b882a 100644 --- a/docker/datahub-ingestion-base/smoke.Dockerfile +++ b/docker/datahub-ingestion-base/smoke.Dockerfile @@ -20,7 +20,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get install -y openjdk-11-jdk COPY . /datahub-src ARG RELEASE_VERSION RUN cd /datahub-src/metadata-ingestion && \ - sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ + sed -i.bak "s/__version__ = \"1!0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ cat src/datahub/__init__.py && \ cd ../ && \ ./gradlew :metadata-ingestion:installAll diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 2ceff6a800ebbb..8b726df5e88420 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -11,8 +11,8 @@ COPY ./metadata-ingestion-modules/airflow-plugin /datahub-ingestion/airflow-plug ARG RELEASE_VERSION WORKDIR /datahub-ingestion -RUN sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ - sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" airflow-plugin/src/datahub_airflow_plugin/__init__.py && \ +RUN sed -i.bak "s/__version__ = \"1!0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ + sed -i.bak "s/__version__ = \"1!0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" airflow-plugin/src/datahub_airflow_plugin/__init__.py && \ cat src/datahub/__init__.py && \ chown -R datahub /datahub-ingestion diff --git a/docker/datahub-ingestion/Dockerfile-slim-only 
b/docker/datahub-ingestion/Dockerfile-slim-only index 678bee7e306f67..9ae116f839aa07 100644 --- a/docker/datahub-ingestion/Dockerfile-slim-only +++ b/docker/datahub-ingestion/Dockerfile-slim-only @@ -9,7 +9,7 @@ COPY ./metadata-ingestion /datahub-ingestion ARG RELEASE_VERSION WORKDIR /datahub-ingestion -RUN sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ +RUN sed -i.bak "s/__version__ = \"1!0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ cat src/datahub/__init__.py && \ chown -R datahub /datahub-ingestion diff --git a/metadata-ingestion-modules/airflow-plugin/scripts/release.sh b/metadata-ingestion-modules/airflow-plugin/scripts/release.sh index 7134187a458850..87157479f37d63 100755 --- a/metadata-ingestion-modules/airflow-plugin/scripts/release.sh +++ b/metadata-ingestion-modules/airflow-plugin/scripts/release.sh @@ -13,7 +13,7 @@ MODULE=datahub_airflow_plugin python -c 'import setuptools; where="./src"; assert setuptools.find_packages(where) == setuptools.find_namespace_packages(where), "you seem to be missing or have extra __init__.py files"' if [[ ${RELEASE_VERSION:-} ]]; then # Replace version with RELEASE_VERSION env variable - sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/${MODULE}/__init__.py + sed -i.bak "s/__version__ = \"1!0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/${MODULE}/__init__.py else vim src/${MODULE}/__init__.py fi diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/__init__.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/__init__.py index ce98a0fc1fb609..b2c45d3a1e75d3 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/__init__.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/__init__.py @@ -1,6 +1,6 @@ # Published at https://pypi.org/project/acryl-datahub/. 
__package_name__ = "acryl-datahub-airflow-plugin" -__version__ = "0.0.0.dev0" +__version__ = "1!0.0.0.dev0" def is_dev_mode() -> bool: diff --git a/metadata-ingestion/scripts/release.sh b/metadata-ingestion/scripts/release.sh index 0a09c4e0307b33..eacaf1d920a8d2 100755 --- a/metadata-ingestion/scripts/release.sh +++ b/metadata-ingestion/scripts/release.sh @@ -11,7 +11,7 @@ fi python -c 'import setuptools; where="./src"; assert setuptools.find_packages(where) == setuptools.find_namespace_packages(where), "you seem to be missing or have extra __init__.py files"' if [[ ${RELEASE_VERSION:-} ]]; then # Replace version with RELEASE_VERSION env variable - sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py + sed -i.bak "s/__version__ = \"1!0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py else vim src/datahub/__init__.py fi diff --git a/metadata-ingestion/src/datahub/__init__.py b/metadata-ingestion/src/datahub/__init__.py index 3ac3efefc14f06..a470de7b500be3 100644 --- a/metadata-ingestion/src/datahub/__init__.py +++ b/metadata-ingestion/src/datahub/__init__.py @@ -3,7 +3,7 @@ # Published at https://pypi.org/project/acryl-datahub/. 
__package_name__ = "acryl-datahub" -__version__ = "0.0.0.dev0" +__version__ = "1!0.0.0.dev0" def is_dev_mode() -> bool: From 449cc9ba91bfc51bc8e5a66de7920340f164f272 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 12 Sep 2023 13:15:05 -0700 Subject: [PATCH 10/65] ci: make wheel builds more robust (#8815) --- docs-website/sphinx/Makefile | 5 ++++- docs-website/sphinx/requirements.txt | 2 +- docs-website/yarn.lock | 18 +++++++++++------- .../airflow-plugin/build.gradle | 6 +++--- metadata-ingestion/build.gradle | 6 +++--- 5 files changed, 22 insertions(+), 15 deletions(-) diff --git a/docs-website/sphinx/Makefile b/docs-website/sphinx/Makefile index 00ece7ae253317..c01b45e322c679 100644 --- a/docs-website/sphinx/Makefile +++ b/docs-website/sphinx/Makefile @@ -22,7 +22,7 @@ $(VENV_SENTINEL): requirements.txt $(VENV_DIR)/bin/pip install -r requirements.txt touch $(VENV_SENTINEL) -.PHONY: help html doctest linkcheck clean serve md +.PHONY: help html doctest linkcheck clean clean_all serve md # Not using Python's http.server because it enables caching headers. serve: @@ -35,3 +35,6 @@ md: html # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
html doctest linkcheck clean: venv Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +clean_all: clean + -rm -rf $(VENV_DIR) diff --git a/docs-website/sphinx/requirements.txt b/docs-website/sphinx/requirements.txt index a63fd058532599..94ddd40579f0e7 100644 --- a/docs-website/sphinx/requirements.txt +++ b/docs-website/sphinx/requirements.txt @@ -1,4 +1,4 @@ --e ../../metadata-ingestion[datahub-rest] +-e ../../metadata-ingestion[datahub-rest,sql-parsing] beautifulsoup4==4.11.2 Sphinx==6.1.3 sphinx-click==4.4.0 diff --git a/docs-website/yarn.lock b/docs-website/yarn.lock index 209a57a43dab03..0613fe71ef78ee 100644 --- a/docs-website/yarn.lock +++ b/docs-website/yarn.lock @@ -2986,6 +2986,13 @@ dependencies: "@types/node" "*" +"@types/websocket@^1.0.3": + version "1.0.6" + resolved "https://registry.yarnpkg.com/@types/websocket/-/websocket-1.0.6.tgz#ec8dce5915741632ac3a4b1f951b6d4156e32d03" + integrity sha512-JXkliwz93B2cMWOI1ukElQBPN88vMg3CruvW4KVSKpflt3NyNCJImnhIuB/f97rG7kakqRJGFiwkA895Kn02Dg== + dependencies: + "@types/node" "*" + "@types/ws@^8.5.5": version "8.5.5" resolved "https://registry.yarnpkg.com/@types/ws/-/ws-8.5.5.tgz#af587964aa06682702ee6dcbc7be41a80e4b28eb" @@ -7053,7 +7060,6 @@ node-forge@^1: resolved "https://registry.yarnpkg.com/node-forge/-/node-forge-1.3.1.tgz#be8da2af243b2417d5f646a770663a92b7e9ded3" integrity sha512-dPEtOeMvF9VMcYV/1Wb8CPoVAXtp6MKMlcbAt4ddqmGqUJ6fQZFXkNZNkNlfevtNkGtaSoXf/vNNNSvgrdXwtA== - node-gyp-build@^4.3.0: version "4.6.1" resolved "https://registry.yarnpkg.com/node-gyp-build/-/node-gyp-build-4.6.1.tgz#24b6d075e5e391b8d5539d98c7fc5c210cac8a3e" @@ -9903,6 +9909,10 @@ use-sidecar@^1.1.2: detect-node-es "^1.1.0" tslib "^2.0.0" +use-sync-external-store@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/use-sync-external-store/-/use-sync-external-store-1.2.0.tgz#7dbefd6ef3fe4e767a0cf5d7287aacfb5846928a" + integrity 
sha512-eEgnFxGQ1Ife9bzYs6VLi8/4X6CObHMw9Qr9tPY43iKwsPw8xE8+EFsf/2cFZ5S3esXgpWgtSCtLNS41F+sKPA== utf-8-validate@^5.0.2: version "5.0.10" @@ -9911,12 +9921,6 @@ utf-8-validate@^5.0.2: dependencies: node-gyp-build "^4.3.0" -use-sync-external-store@^1.2.0: - version "1.2.0" - resolved "https://registry.yarnpkg.com/use-sync-external-store/-/use-sync-external-store-1.2.0.tgz#7dbefd6ef3fe4e767a0cf5d7287aacfb5846928a" - integrity sha512-eEgnFxGQ1Ife9bzYs6VLi8/4X6CObHMw9Qr9tPY43iKwsPw8xE8+EFsf/2cFZ5S3esXgpWgtSCtLNS41F+sKPA== - - util-deprecate@^1.0.1, util-deprecate@^1.0.2, util-deprecate@~1.0.1: version "1.0.2" resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf" diff --git a/metadata-ingestion-modules/airflow-plugin/build.gradle b/metadata-ingestion-modules/airflow-plugin/build.gradle index d1e6f2f6464914..58a2bc9e670e34 100644 --- a/metadata-ingestion-modules/airflow-plugin/build.gradle +++ b/metadata-ingestion-modules/airflow-plugin/build.gradle @@ -110,14 +110,14 @@ task testFull(type: Exec, dependsOn: [testQuick, installDevTest]) { commandLine 'bash', '-x', '-c', "source ${venv_name}/bin/activate && pytest -m 'not slow_integration' -vv --continue-on-collection-errors --junit-xml=junit.full.xml" } -task buildWheel(type: Exec, dependsOn: [install]) { - commandLine 'bash', '-c', "source ${venv_name}/bin/activate && " + 'pip install build && RELEASE_VERSION="\${RELEASE_VERSION:-0.0.0.dev1}" RELEASE_SKIP_TEST=1 RELEASE_SKIP_UPLOAD=1 ./scripts/release.sh' -} task cleanPythonCache(type: Exec) { commandLine 'bash', '-c', "find src -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete -o -type d -empty -delete" } +task buildWheel(type: Exec, dependsOn: [install, cleanPythonCache]) { + commandLine 'bash', '-c', "source ${venv_name}/bin/activate && " + 'pip install build && RELEASE_VERSION="\${RELEASE_VERSION:-0.0.0.dev1}" RELEASE_SKIP_TEST=1 RELEASE_SKIP_UPLOAD=1 ./scripts/release.sh' +} 
build.dependsOn install check.dependsOn lint diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index 408ea771bc93f6..c20d98cbcbb58b 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -185,9 +185,6 @@ task specGen(type: Exec, dependsOn: [codegen, installDevTest]) { task docGen(type: Exec, dependsOn: [codegen, installDevTest, specGen]) { commandLine 'bash', '-c', "source ${venv_name}/bin/activate && ./scripts/docgen.sh" } -task buildWheel(type: Exec, dependsOn: [install, codegen]) { - commandLine 'bash', '-c', "source ${venv_name}/bin/activate && " + 'pip install build && RELEASE_VERSION="\${RELEASE_VERSION:-0.0.0.dev1}" RELEASE_SKIP_TEST=1 RELEASE_SKIP_UPLOAD=1 ./scripts/release.sh' -} @@ -195,6 +192,9 @@ task cleanPythonCache(type: Exec) { commandLine 'bash', '-c', "find src tests -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete -o -type d -empty -delete" } +task buildWheel(type: Exec, dependsOn: [install, codegen, cleanPythonCache]) { + commandLine 'bash', '-c', "source ${venv_name}/bin/activate && " + 'pip install build && RELEASE_VERSION="\${RELEASE_VERSION:-0.0.0.dev1}" RELEASE_SKIP_TEST=1 RELEASE_SKIP_UPLOAD=1 ./scripts/release.sh' +} build.dependsOn install check.dependsOn lint From 138f6c0f74a4799d31560e9fde19ef6011089990 Mon Sep 17 00:00:00 2001 From: Pedro Silva Date: Tue, 12 Sep 2023 22:26:30 +0100 Subject: [PATCH 11/65] feat(cli): fix upload ingest cli endpoint (#8826) --- metadata-ingestion/src/datahub/cli/ingest_cli.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py index 42c0ea1601c74d..5931bf89b010b5 100644 --- a/metadata-ingestion/src/datahub/cli/ingest_cli.py +++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py @@ -282,12 +282,14 @@ def deploy( "urn": urn, "name": name, "type": pipeline_config["source"]["type"], - "schedule": {"interval": 
schedule, "timezone": time_zone}, "recipe": json.dumps(pipeline_config), "executorId": executor_id, "version": cli_version, } + if schedule is not None: + variables["schedule"] = {"interval": schedule, "timezone": time_zone} + if urn: if not datahub_graph.exists(urn): logger.error(f"Could not find recipe for provided urn: {urn}") @@ -331,6 +333,7 @@ def deploy( $version: String) { createIngestionSource(input: { + name: $name, type: $type, schedule: $schedule, config: { From 3cc0f76d178f239acc018e06ec408eb6b38bfb5d Mon Sep 17 00:00:00 2001 From: Adriano Vega Llobell Date: Tue, 12 Sep 2023 23:34:24 +0200 Subject: [PATCH 12/65] docs(transformer): fix names in sample code of 'pattern_add_dataset_domain' (#8755) --- metadata-ingestion/docs/transformer/dataset_transformer.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md index cb06656940918d..f0fa44687a1096 100644 --- a/metadata-ingestion/docs/transformer/dataset_transformer.md +++ b/metadata-ingestion/docs/transformer/dataset_transformer.md @@ -909,7 +909,7 @@ in both of the cases domain should be provisioned on DataHub GMS - Add domains, however replace existing domains sent by ingestion source ```yaml transformers: - - type: "pattern_add_dataset_ownership" + - type: "pattern_add_dataset_domain" config: replace_existing: true # false is default behaviour domain_pattern: @@ -920,7 +920,7 @@ in both of the cases domain should be provisioned on DataHub GMS - Add domains, however overwrite the domains available for the dataset on DataHub GMS ```yaml transformers: - - type: "pattern_add_dataset_ownership" + - type: "pattern_add_dataset_domain" config: semantics: OVERWRITE # OVERWRITE is default behaviour domain_pattern: @@ -931,7 +931,7 @@ in both of the cases domain should be provisioned on DataHub GMS - Add domains, however keep the domains available for the dataset on DataHub 
GMS ```yaml transformers: - - type: "pattern_add_dataset_ownership" + - type: "pattern_add_dataset_domain" config: semantics: PATCH domain_pattern: From 785ab7718df8e4e46bdd612ed3deaafbda1d42cc Mon Sep 17 00:00:00 2001 From: ethan-cartwright Date: Wed, 13 Sep 2023 03:45:58 -0400 Subject: [PATCH 13/65] fix(siblingsHook): check number of dbtUpstreams instead of all upStreams (#8817) Co-authored-by: Ethan Cartwright --- .../hook/siblings/SiblingAssociationHook.java | 19 ++- .../siblings/SiblingAssociationHookTest.java | 112 ++++++++++++++---- 2 files changed, 100 insertions(+), 31 deletions(-) diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHook.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHook.java index 2be719ed263ea5..06545ef3525dd6 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHook.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHook.java @@ -200,10 +200,19 @@ private void handleSourceDatasetEvent(MetadataChangeLog event, DatasetUrn source UpstreamLineage upstreamLineage = getUpstreamLineageFromEvent(event); if (upstreamLineage != null && upstreamLineage.hasUpstreams()) { UpstreamArray upstreams = upstreamLineage.getUpstreams(); - if ( - upstreams.size() == 1 - && upstreams.get(0).getDataset().getPlatformEntity().getPlatformNameEntity().equals(DBT_PLATFORM_NAME)) { - setSiblingsAndSoftDeleteSibling(upstreams.get(0).getDataset(), sourceUrn); + + // an entity can have merged lineage (eg. dbt + snowflake), but by default siblings are only between dbt <> non-dbt + UpstreamArray dbtUpstreams = new UpstreamArray( + upstreams.stream() + .filter(obj -> obj.getDataset().getPlatformEntity().getPlatformNameEntity().equals(DBT_PLATFORM_NAME)) + .collect(Collectors.toList()) + ); + // We're assuming a data asset (eg. 
snowflake table) will only ever be downstream of 1 dbt model + if (dbtUpstreams.size() == 1) { + setSiblingsAndSoftDeleteSibling(dbtUpstreams.get(0).getDataset(), sourceUrn); + } else { + log.error("{} has an unexpected number of dbt upstreams: {}. Not adding any as siblings.", sourceUrn.toString(), dbtUpstreams.size()); + } } } @@ -219,7 +228,7 @@ private void setSiblingsAndSoftDeleteSibling(Urn dbtUrn, Urn sourceUrn) { existingDbtSiblingAspect != null && existingSourceSiblingAspect != null && existingDbtSiblingAspect.getSiblings().contains(sourceUrn.toString()) - && existingDbtSiblingAspect.getSiblings().contains(dbtUrn.toString()) + && existingSourceSiblingAspect.getSiblings().contains(dbtUrn.toString()) ) { // we have already connected them- we can abort here return; diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHookTest.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHookTest.java index 5fb2cfaaef2d11..78d304d67bfc09 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHookTest.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/siblings/SiblingAssociationHookTest.java @@ -36,6 +36,8 @@ import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; +import java.net.URISyntaxException; + import static com.linkedin.metadata.Constants.*; import static org.mockito.ArgumentMatchers.*; @@ -78,15 +80,12 @@ public void testInvokeWhenThereIsAPairWithDbtSourceNode() throws Exception { _mockAuthentication )).thenReturn(mockResponse); - MetadataChangeLog event = new MetadataChangeLog(); - event.setEntityType(DATASET_ENTITY_NAME); - event.setAspectName(UPSTREAM_LINEAGE_ASPECT_NAME); - event.setChangeType(ChangeType.UPSERT); + + MetadataChangeLog event = createEvent(DATASET_ENTITY_NAME, UPSTREAM_LINEAGE_ASPECT_NAME, ChangeType.UPSERT); + + 
Upstream upstream = createUpstream("urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj.jaffle_shop.customers,PROD)", DatasetLineageType.TRANSFORMED); final UpstreamLineage upstreamLineage = new UpstreamLineage(); final UpstreamArray upstreamArray = new UpstreamArray(); - final Upstream upstream = new Upstream(); - upstream.setType(DatasetLineageType.TRANSFORMED); - upstream.setDataset(DatasetUrn.createFromString("urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj.jaffle_shop.customers,PROD)")); upstreamArray.add(upstream); upstreamLineage.setUpstreams(upstreamArray); @@ -151,15 +150,11 @@ public void testInvokeWhenThereIsNoPairWithDbtModel() throws Exception { _mockAuthentication )).thenReturn(mockResponse); - MetadataChangeLog event = new MetadataChangeLog(); - event.setEntityType(DATASET_ENTITY_NAME); - event.setAspectName(UPSTREAM_LINEAGE_ASPECT_NAME); - event.setChangeType(ChangeType.UPSERT); + MetadataChangeLog event = createEvent(DATASET_ENTITY_NAME, UPSTREAM_LINEAGE_ASPECT_NAME, ChangeType.UPSERT); + Upstream upstream = createUpstream("urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj.jaffle_shop.customers,PROD)", DatasetLineageType.TRANSFORMED); + final UpstreamLineage upstreamLineage = new UpstreamLineage(); final UpstreamArray upstreamArray = new UpstreamArray(); - final Upstream upstream = new Upstream(); - upstream.setType(DatasetLineageType.TRANSFORMED); - upstream.setDataset(DatasetUrn.createFromString("urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj.jaffle_shop.customers,PROD)")); upstreamArray.add(upstream); upstreamLineage.setUpstreams(upstreamArray); @@ -189,15 +184,11 @@ public void testInvokeWhenThereIsNoPairWithDbtModel() throws Exception { public void testInvokeWhenThereIsAPairWithBigqueryDownstreamNode() throws Exception { Mockito.when(_mockEntityClient.exists(Mockito.any(), Mockito.any())).thenReturn(true); - MetadataChangeLog event = new MetadataChangeLog(); - event.setEntityType(DATASET_ENTITY_NAME); - 
event.setAspectName(UPSTREAM_LINEAGE_ASPECT_NAME); - event.setChangeType(ChangeType.UPSERT); + + MetadataChangeLog event = createEvent(DATASET_ENTITY_NAME, UPSTREAM_LINEAGE_ASPECT_NAME, ChangeType.UPSERT); final UpstreamLineage upstreamLineage = new UpstreamLineage(); final UpstreamArray upstreamArray = new UpstreamArray(); - final Upstream upstream = new Upstream(); - upstream.setType(DatasetLineageType.TRANSFORMED); - upstream.setDataset(DatasetUrn.createFromString("urn:li:dataset:(urn:li:dataPlatform:dbt,my-proj.jaffle_shop.customers,PROD)")); + Upstream upstream = createUpstream("urn:li:dataset:(urn:li:dataPlatform:dbt,my-proj.jaffle_shop.customers,PROD)", DatasetLineageType.TRANSFORMED); upstreamArray.add(upstream); upstreamLineage.setUpstreams(upstreamArray); @@ -259,10 +250,7 @@ public void testInvokeWhenThereIsAKeyBeingReingested() throws Exception { .setSkipAggregates(true).setSkipHighlighting(true)) )).thenReturn(returnSearchResult); - MetadataChangeLog event = new MetadataChangeLog(); - event.setEntityType(DATASET_ENTITY_NAME); - event.setAspectName(DATASET_KEY_ASPECT_NAME); - event.setChangeType(ChangeType.UPSERT); + MetadataChangeLog event = createEvent(DATASET_ENTITY_NAME, DATASET_KEY_ASPECT_NAME, ChangeType.UPSERT); final DatasetKey datasetKey = new DatasetKey(); datasetKey.setName("my-proj.jaffle_shop.customers"); datasetKey.setOrigin(FabricType.PROD); @@ -304,4 +292,76 @@ public void testInvokeWhenThereIsAKeyBeingReingested() throws Exception { Mockito.eq(_mockAuthentication) ); } -} + @Test + public void testInvokeWhenSourceUrnHasTwoDbtUpstreams() throws Exception { + + MetadataChangeLog event = createEvent(DATASET_ENTITY_NAME, UPSTREAM_LINEAGE_ASPECT_NAME, ChangeType.UPSERT); + final UpstreamLineage upstreamLineage = new UpstreamLineage(); + final UpstreamArray upstreamArray = new UpstreamArray(); + Upstream dbtUpstream1 = createUpstream("urn:li:dataset:(urn:li:dataPlatform:dbt,my-proj.source_entity1,PROD)", DatasetLineageType.TRANSFORMED); + 
Upstream dbtUpstream2 = createUpstream("urn:li:dataset:(urn:li:dataPlatform:dbt,my-proj.source_entity2,PROD)", DatasetLineageType.TRANSFORMED); + upstreamArray.add(dbtUpstream1); + upstreamArray.add(dbtUpstream2); + upstreamLineage.setUpstreams(upstreamArray); + + event.setAspect(GenericRecordUtils.serializeAspect(upstreamLineage)); + event.setEntityUrn(Urn.createFromString("urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj.jaffle_shop.customers,PROD)")); + _siblingAssociationHook.invoke(event); + + + Mockito.verify(_mockEntityClient, Mockito.times(0)).ingestProposal( + Mockito.any(), + Mockito.eq(_mockAuthentication) + ); + + + } + + @Test + public void testInvokeWhenSourceUrnHasTwoUpstreamsOneDbt() throws Exception { + + MetadataChangeLog event = createEvent(DATASET_ENTITY_NAME, UPSTREAM_LINEAGE_ASPECT_NAME, ChangeType.UPSERT); + final UpstreamLineage upstreamLineage = new UpstreamLineage(); + final UpstreamArray upstreamArray = new UpstreamArray(); + Upstream dbtUpstream = createUpstream("urn:li:dataset:(urn:li:dataPlatform:dbt,my-proj.source_entity1,PROD)", DatasetLineageType.TRANSFORMED); + Upstream snowflakeUpstream = + createUpstream("urn:li:dataset:(urn:li:dataPlatform:snowflake,my-proj.jaffle_shop.customers,PROD)", DatasetLineageType.TRANSFORMED); + upstreamArray.add(dbtUpstream); + upstreamArray.add(snowflakeUpstream); + upstreamLineage.setUpstreams(upstreamArray); + + event.setAspect(GenericRecordUtils.serializeAspect(upstreamLineage)); + event.setEntityUrn(Urn.createFromString("urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj.jaffle_shop.customers,PROD)")); + _siblingAssociationHook.invoke(event); + + + Mockito.verify(_mockEntityClient, Mockito.times(2)).ingestProposal( + Mockito.any(), + Mockito.eq(_mockAuthentication) + ); + + + } + + private MetadataChangeLog createEvent(String entityType, String aspectName, ChangeType changeType) { + MetadataChangeLog event = new MetadataChangeLog(); + event.setEntityType(entityType); + 
event.setAspectName(aspectName); + event.setChangeType(changeType); + return event; + } + private Upstream createUpstream(String urn, DatasetLineageType upstreamType) { + + final Upstream upstream = new Upstream(); + upstream.setType(upstreamType); + try { + upstream.setDataset(DatasetUrn.createFromString(urn)); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + + return upstream; + } + + + } From e9b4727c8e270d22c80c4be7133a3315adbc5691 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Wed, 13 Sep 2023 11:18:52 -0400 Subject: [PATCH 14/65] fix(java) Update DataProductMapper to always return a name (#8832) --- .../types/dataproduct/mappers/DataProductMapper.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataproduct/mappers/DataProductMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataproduct/mappers/DataProductMapper.java index 9cb6840067e7b8..254b43ecb96ccb 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataproduct/mappers/DataProductMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataproduct/mappers/DataProductMapper.java @@ -50,7 +50,8 @@ public DataProduct apply(@Nonnull final EntityResponse entityResponse) { EnvelopedAspectMap aspectMap = entityResponse.getAspects(); MappingHelper mappingHelper = new MappingHelper<>(aspectMap, result); - mappingHelper.mapToResult(DATA_PRODUCT_PROPERTIES_ASPECT_NAME, this::mapDataProductProperties); + mappingHelper.mapToResult(DATA_PRODUCT_PROPERTIES_ASPECT_NAME, (dataProduct, dataMap) -> + mapDataProductProperties(dataProduct, dataMap, entityUrn)); mappingHelper.mapToResult(GLOBAL_TAGS_ASPECT_NAME, (dataProduct, dataMap) -> dataProduct.setTags(GlobalTagsMapper.map(new GlobalTags(dataMap), entityUrn))); mappingHelper.mapToResult(GLOSSARY_TERMS_ASPECT_NAME, (dataProduct, dataMap) -> @@ -65,11 
+66,12 @@ public DataProduct apply(@Nonnull final EntityResponse entityResponse) { return result; } - private void mapDataProductProperties(@Nonnull DataProduct dataProduct, @Nonnull DataMap dataMap) { + private void mapDataProductProperties(@Nonnull DataProduct dataProduct, @Nonnull DataMap dataMap, @Nonnull Urn urn) { DataProductProperties dataProductProperties = new DataProductProperties(dataMap); com.linkedin.datahub.graphql.generated.DataProductProperties properties = new com.linkedin.datahub.graphql.generated.DataProductProperties(); - properties.setName(dataProductProperties.getName()); + final String name = dataProductProperties.hasName() ? dataProductProperties.getName() : urn.getId(); + properties.setName(name); properties.setDescription(dataProductProperties.getDescription()); if (dataProductProperties.hasExternalUrl()) { properties.setExternalUrl(dataProductProperties.getExternalUrl().toString()); From 1474ac01b19f47d1011dc836f0fceeb59bd1720d Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 13 Sep 2023 12:32:45 -0700 Subject: [PATCH 15/65] build(ingest): Bump jsonschema for Python >= 3.8 (#8836) --- metadata-ingestion/setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 09f71fa769fd37..7a5fd355803cb4 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -58,7 +58,8 @@ def get_long_description(): "requests_file", "jsonref", # jsonschema drops python 3.7 support in v4.18.0 - "jsonschema<=4.17.3", + "jsonschema<=4.17.3 ; python_version < '3.8'", + "jsonschema>=4.18.0 ; python_version >= '3.8'", "ruamel.yaml", } From 493d31531a1ed829adc106ea7722c88c50b70270 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 13 Sep 2023 14:00:58 -0700 Subject: [PATCH 16/65] feat(ingest/rest-emitter): Do not raise error on retry failure to get better error messages (#8837) --- metadata-ingestion/src/datahub/emitter/rest_emitter.py | 5 +++++ 1 file 
changed, 5 insertions(+) diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index acb57632809059..937e0902d6d8c7 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -120,11 +120,15 @@ def __init__( self._retry_max_times = retry_max_times try: + # Set raise_on_status to False to propagate errors: + # https://stackoverflow.com/questions/70189330/determine-status-code-from-python-retry-exception + # Must call `raise_for_status` after making a request, which we do retry_strategy = Retry( total=self._retry_max_times, status_forcelist=self._retry_status_codes, backoff_factor=2, allowed_methods=self._retry_methods, + raise_on_status=False, ) except TypeError: # Prior to urllib3 1.26, the Retry class used `method_whitelist` instead of `allowed_methods`. @@ -133,6 +137,7 @@ def __init__( status_forcelist=self._retry_status_codes, backoff_factor=2, method_whitelist=self._retry_methods, + raise_on_status=False, ) adapter = HTTPAdapter( From 31abf383d13538cdb2fdb3b89ca3ca1fe6b1590f Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Thu, 14 Sep 2023 11:34:21 +0900 Subject: [PATCH 17/65] ci: add markdown-link-check (#8771) --- README.md | 6 +- docs-website/build.gradle | 2 +- docs-website/markdown-link-check-config.json | 50 +++++++ docs-website/package.json | 3 +- docs-website/yarn.lock | 122 ++++++++++++++++-- docs/advanced/no-code-modeling.md | 7 +- docs/api/graphql/how-to-set-up-graphql.md | 2 +- docs/architecture/architecture.md | 2 +- docs/authentication/guides/add-users.md | 8 +- .../guides/sso/configure-oidc-react.md | 2 +- docs/cli.md | 2 +- docs/domains.md | 19 ++- docs/how/add-new-aspect.md | 10 +- docs/modeling/extending-the-metadata-model.md | 10 +- docs/modeling/metadata-model.md | 4 +- docs/tags.md | 10 +- docs/townhall-history.md | 2 +- docs/what/gms.md | 4 +- docs/what/mxe.md | 2 +- 
docs/what/relationship.md | 3 - docs/what/search-document.md | 1 - .../add_stateful_ingestion_to_source.md | 13 +- .../docs/dev_guides/reporting_telemetry.md | 2 +- .../docs/dev_guides/stateful.md | 16 +-- metadata-ingestion/docs/sources/gcs/README.md | 4 +- .../docs/sources/kafka-connect/README.md | 10 +- metadata-ingestion/docs/sources/s3/README.md | 4 +- .../examples/transforms/README.md | 2 +- .../source/usage/starburst_trino_usage.py | 3 - metadata-jobs/README.md | 4 +- metadata-models/docs/entities/dataPlatform.md | 4 +- 31 files changed, 236 insertions(+), 97 deletions(-) create mode 100644 docs-website/markdown-link-check-config.json diff --git a/README.md b/README.md index 951dcebad64986..79f85433fbc184 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ export const Logo = (props) => {
DataHub Logo
@@ -18,7 +18,7 @@ export const Logo = (props) => {

-DataHub +DataHub

@@ -156,7 +156,7 @@ Here are the companies that have officially adopted DataHub. Please feel free to - [DataHub Blog](https://blog.datahubproject.io/) - [DataHub YouTube Channel](https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w) -- [Optum: Data Mesh via DataHub](https://optum.github.io/blog/2022/03/23/data-mesh-via-datahub/) +- [Optum: Data Mesh via DataHub](https://opensource.optum.com/blog/2022/03/23/data-mesh-via-datahub) - [Saxo Bank: Enabling Data Discovery in Data Mesh](https://medium.com/datahub-project/enabling-data-discovery-in-a-data-mesh-the-saxo-journey-451b06969c8f) - [Bringing The Power Of The DataHub Real-Time Metadata Graph To Everyone At Acryl Data](https://www.dataengineeringpodcast.com/acryl-data-datahub-metadata-graph-episode-230/) - [DataHub: Popular Metadata Architectures Explained](https://engineering.linkedin.com/blog/2020/datahub-popular-metadata-architectures-explained) diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 370ae3eec91761..a213ec1ae8194d 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -89,7 +89,7 @@ task fastReload(type: YarnTask) { args = ['run', 'generate-rsync'] } -task yarnLint(type: YarnTask, dependsOn: [yarnInstall]) { +task yarnLint(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { inputs.files(projectMdFiles) args = ['run', 'lint-check'] outputs.dir("dist") diff --git a/docs-website/markdown-link-check-config.json b/docs-website/markdown-link-check-config.json new file mode 100644 index 00000000000000..26e040edde6f79 --- /dev/null +++ b/docs-website/markdown-link-check-config.json @@ -0,0 +1,50 @@ +{ + "ignorePatterns": [ + { + "pattern": "^http://demo\\.datahubproject\\.io" + }, + { + "pattern": "^http://localhost" + }, + { + "pattern": "^http://www.famfamfam.com" + }, + { + "pattern": "^http://www.linkedin.com" + }, + { + "pattern": "\\.md$" + }, + { + "pattern":"\\.json$" + }, + { + "pattern":"\\.txt$" + }, + { + "pattern": "\\.java$" + }, + { + 
"pattern": "\\.md#.*$" + }, + { + "pattern": "^https://oauth2.googleapis.com/token" + }, + { + "pattern": "^https://login.microsoftonline.com/common/oauth2/na$" + }, + { + "pattern": "#v(\\d+)-(\\d+)-(\\d+)" + }, + { + "pattern": "^https://github.com/mohdsiddique$" + }, + { + "pattern": "^https://github.com/2x$" + }, + { + "pattern": "^https://github.com/datahub-project/datahub/assets/15873986/2f47d033-6c2b-483a-951d-e6d6b807f0d0%22%3E$" + } + ], + "aliveStatusCodes": [200, 206, 0, 999, 400, 401, 403] +} \ No newline at end of file diff --git a/docs-website/package.json b/docs-website/package.json index 400ef4143c786a..1722f921696927 100644 --- a/docs-website/package.json +++ b/docs-website/package.json @@ -17,7 +17,7 @@ "generate": "rm -rf genDocs genStatic && mkdir genDocs genStatic && yarn _generate-docs && mv docs/* genDocs/ && rmdir docs", "generate-rsync": "mkdir -p genDocs genStatic && yarn _generate-docs && rsync -v --checksum -r -h -i --delete docs/ genDocs && rm -rf docs", "lint": "prettier -w generateDocsDir.ts sidebars.js src/pages/index.js", - "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js", + "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js && find ./genDocs -name \\*.md -not -path \"./genDocs/python-sdk/models.md\" -print0 | xargs -0 -n1 markdown-link-check -p -q -c markdown-link-check-config.json", "lint-fix": "prettier --write generateDocsDir.ts sidebars.js src/pages/index.js" }, "dependencies": { @@ -37,6 +37,7 @@ "docusaurus-graphql-plugin": "0.5.0", "docusaurus-plugin-sass": "^0.2.1", "dotenv": "^16.0.1", + "markdown-link-check": "^3.11.2", "markprompt": "^0.1.7", "react": "^18.2.0", "react-dom": "18.2.0", diff --git a/docs-website/yarn.lock b/docs-website/yarn.lock index 0613fe71ef78ee..5698029bff70a8 100644 --- a/docs-website/yarn.lock +++ b/docs-website/yarn.lock @@ -3414,6 +3414,11 @@ async-validator@^4.1.0: resolved 
"https://registry.yarnpkg.com/async-validator/-/async-validator-4.2.5.tgz#c96ea3332a521699d0afaaceed510a54656c6339" integrity sha512-7HhHjtERjqlNbZtqNqy2rckN/SpOOlmDliet+lP7k+eKZEjPk3DgyeU9lIXLdeLz0uBbbVp+9Qdow9wJWgwwfg== +async@^3.2.4: + version "3.2.4" + resolved "https://registry.yarnpkg.com/async/-/async-3.2.4.tgz#2d22e00f8cddeb5fde5dd33522b56d1cf569a81c" + integrity sha512-iAB+JbDEGXhyIUavoDl9WP/Jj106Kz9DEn1DPgYw5ruDn0e3Wgi3sKFm55sASdGBNOQB8F59d9qQ7deqrHA8wQ== + asynckit@^0.4.0: version "0.4.0" resolved "https://registry.yarnpkg.com/asynckit/-/asynckit-0.4.0.tgz#c79ed97f7f34cb8f2ba1bc9790bcc366474b4b79" @@ -3765,6 +3770,11 @@ chalk@^4.0.0, chalk@^4.1.0, chalk@^4.1.2: ansi-styles "^4.1.0" supports-color "^7.1.0" +chalk@^5.2.0: + version "5.3.0" + resolved "https://registry.yarnpkg.com/chalk/-/chalk-5.3.0.tgz#67c20a7ebef70e7f3970a01f90fa210cb6860385" + integrity sha512-dLitG79d+GV1Nb/VYcCDFivJeK1hiukt9QjRNVOsUtTy1rR1YJsmpGGTZ3qJos+uw7WmWF4wUwBd9jxjocFC2w== + character-entities-legacy@^1.0.0: version "1.1.4" resolved "https://registry.yarnpkg.com/character-entities-legacy/-/character-entities-legacy-1.1.4.tgz#94bc1845dce70a5bb9d2ecc748725661293d8fc1" @@ -3797,7 +3807,7 @@ cheerio-select@^2.1.0: domhandler "^5.0.3" domutils "^3.0.1" -cheerio@^1.0.0-rc.12: +cheerio@^1.0.0-rc.10, cheerio@^1.0.0-rc.12: version "1.0.0-rc.12" resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-1.0.0-rc.12.tgz#788bf7466506b1c6bf5fae51d24a2c4d62e47683" integrity sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q== @@ -3984,6 +3994,11 @@ comma-separated-tokens@^2.0.0: resolved "https://registry.yarnpkg.com/comma-separated-tokens/-/comma-separated-tokens-2.0.3.tgz#4e89c9458acb61bc8fef19f4529973b2392839ee" integrity sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg== +commander@^10.0.1: + version "10.0.1" + resolved 
"https://registry.yarnpkg.com/commander/-/commander-10.0.1.tgz#881ee46b4f77d1c1dccc5823433aa39b022cbe06" + integrity sha512-y4Mg2tXshplEbSGzx7amzPwKKOCGuoSRP/CjEdwwk0FOGlUbq6lKuoyDZTNZkmxHdJtp54hdfY/JUrdL7Xfdug== + commander@^2.20.0: version "2.20.3" resolved "https://registry.yarnpkg.com/commander/-/commander-2.20.3.tgz#fd485e84c03eb4881c20722ba48035e8531aeb33" @@ -4385,6 +4400,13 @@ debug@4, debug@^4.0.0, debug@^4.1.0, debug@^4.1.1: dependencies: ms "2.1.2" +debug@^3.2.6: + version "3.2.7" + resolved "https://registry.yarnpkg.com/debug/-/debug-3.2.7.tgz#72580b7e9145fb39b6676f9c5e5fb100b934179a" + integrity sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ== + dependencies: + ms "^2.1.1" + decode-named-character-reference@^1.0.0: version "1.0.2" resolved "https://registry.yarnpkg.com/decode-named-character-reference/-/decode-named-character-reference-1.0.2.tgz#daabac9690874c394c81e4162a0304b35d824f0e" @@ -5551,6 +5573,13 @@ html-entities@^2.3.2: resolved "https://registry.yarnpkg.com/html-entities/-/html-entities-2.4.0.tgz#edd0cee70402584c8c76cc2c0556db09d1f45061" integrity sha512-igBTJcNNNhvZFRtm8uA6xMY6xYleeDwn3PeBCkDz7tHttv4F2hsDI2aPgNERWzvRcNYHNT3ymRaQzllmXj4YsQ== +html-link-extractor@^1.0.5: + version "1.0.5" + resolved "https://registry.yarnpkg.com/html-link-extractor/-/html-link-extractor-1.0.5.tgz#a4be345cb13b8c3352d82b28c8b124bb7bf5dd6f" + integrity sha512-ADd49pudM157uWHwHQPUSX4ssMsvR/yHIswOR5CUfBdK9g9ZYGMhVSE6KZVHJ6kCkR0gH4htsfzU6zECDNVwyw== + dependencies: + cheerio "^1.0.0-rc.10" + html-minifier-terser@^6.0.2, html-minifier-terser@^6.1.0: version "6.1.0" resolved "https://registry.yarnpkg.com/html-minifier-terser/-/html-minifier-terser-6.1.0.tgz#bfc818934cc07918f6b3669f5774ecdfd48f32ab" @@ -5673,6 +5702,13 @@ iconv-lite@0.4.24: dependencies: safer-buffer ">= 2.1.2 < 3" +iconv-lite@^0.6.3: + version "0.6.3" + resolved 
"https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.6.3.tgz#a52f80bf38da1952eb5c681790719871a1a72501" + integrity sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw== + dependencies: + safer-buffer ">= 2.1.2 < 3.0.0" + icss-utils@^5.0.0, icss-utils@^5.1.0: version "5.1.0" resolved "https://registry.yarnpkg.com/icss-utils/-/icss-utils-5.1.0.tgz#c6be6858abd013d768e98366ae47e25d5887b1ae" @@ -5795,6 +5831,11 @@ ipaddr.js@^2.0.1: resolved "https://registry.yarnpkg.com/ipaddr.js/-/ipaddr.js-2.1.0.tgz#2119bc447ff8c257753b196fc5f1ce08a4cdf39f" integrity sha512-LlbxQ7xKzfBusov6UMi4MFpEg0m+mAm9xyNGEduwXMEDuf4WfzB/RZwMVYEd7IKGvh4IUkEXYxtAVu9T3OelJQ== +is-absolute-url@^4.0.1: + version "4.0.1" + resolved "https://registry.yarnpkg.com/is-absolute-url/-/is-absolute-url-4.0.1.tgz#16e4d487d4fded05cfe0685e53ec86804a5e94dc" + integrity sha512-/51/TKE88Lmm7Gc4/8btclNXWS+g50wXhYJq8HWIBAGUBnoAdRu1aXeh364t/O7wXDAcTJDP8PNuNKWUDWie+A== + is-alphabetical@1.0.4, is-alphabetical@^1.0.0: version "1.0.4" resolved "https://registry.yarnpkg.com/is-alphabetical/-/is-alphabetical-1.0.4.tgz#9e7d6b94916be22153745d184c298cbf986a686d" @@ -5963,6 +6004,13 @@ is-regexp@^1.0.0: resolved "https://registry.yarnpkg.com/is-regexp/-/is-regexp-1.0.0.tgz#fd2d883545c46bac5a633e7b9a09e87fa2cb5069" integrity sha512-7zjFAPO4/gwyQAAgRRmqeEeyIICSdmCqa3tsVHMdBzaXXRiqopZL4Cyghg/XulGWrtABTpbnYYzzIRffLkP4oA== +is-relative-url@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/is-relative-url/-/is-relative-url-4.0.0.tgz#4d8371999ff6033b76e4d9972fb5bf496fddfa97" + integrity sha512-PkzoL1qKAYXNFct5IKdKRH/iBQou/oCC85QhXj6WKtUQBliZ4Yfd3Zk27RHu9KQG8r6zgvAA2AQKC9p+rqTszg== + dependencies: + is-absolute-url "^4.0.1" + is-root@^2.1.0: version "2.1.0" resolved "https://registry.yarnpkg.com/is-root/-/is-root-2.1.0.tgz#809e18129cf1129644302a4f8544035d51984a9c" @@ -6010,6 +6058,13 @@ isarray@~1.0.0: resolved 
"https://registry.yarnpkg.com/isarray/-/isarray-1.0.0.tgz#bb935d48582cba168c06834957a54a3e07124f11" integrity sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ== +isemail@^3.2.0: + version "3.2.0" + resolved "https://registry.yarnpkg.com/isemail/-/isemail-3.2.0.tgz#59310a021931a9fb06bbb51e155ce0b3f236832c" + integrity sha512-zKqkK+O+dGqevc93KNsbZ/TqTUFd46MwWjYOoMrjIMZ51eU7DtQG3Wmd9SQQT7i7RVnuTPEiYEWHU3MSbxC1Tg== + dependencies: + punycode "2.x.x" + isexe@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10" @@ -6205,6 +6260,16 @@ lines-and-columns@^1.1.6: resolved "https://registry.yarnpkg.com/lines-and-columns/-/lines-and-columns-1.2.4.tgz#eca284f75d2965079309dc0ad9255abb2ebc1632" integrity sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg== +link-check@^5.2.0: + version "5.2.0" + resolved "https://registry.yarnpkg.com/link-check/-/link-check-5.2.0.tgz#595a339d305900bed8c1302f4342a29c366bf478" + integrity sha512-xRbhYLaGDw7eRDTibTAcl6fXtmUQ13vkezQiTqshHHdGueQeumgxxmQMIOmJYsh2p8BF08t8thhDQ++EAOOq3w== + dependencies: + is-relative-url "^4.0.0" + isemail "^3.2.0" + ms "^2.1.3" + needle "^3.1.0" + loader-runner@^4.2.0: version "4.3.0" resolved "https://registry.yarnpkg.com/loader-runner/-/loader-runner-4.3.0.tgz#c1b4a163b99f614830353b16755e7149ac2314e1" @@ -6366,6 +6431,28 @@ markdown-escapes@^1.0.0: resolved "https://registry.yarnpkg.com/markdown-escapes/-/markdown-escapes-1.0.4.tgz#c95415ef451499d7602b91095f3c8e8975f78535" integrity sha512-8z4efJYk43E0upd0NbVXwgSTQs6cT3T06etieCMEg7dRbzCbxUCK/GHlX8mhHRDcp+OLlHkPKsvqQTCvsRl2cg== +markdown-link-check@^3.11.2: + version "3.11.2" + resolved "https://registry.yarnpkg.com/markdown-link-check/-/markdown-link-check-3.11.2.tgz#303a8a03d4a34c42ef3158e0b245bced26b5d904" + integrity 
sha512-zave+vI4AMeLp0FlUllAwGbNytSKsS3R2Zgtf3ufVT892Z/L6Ro9osZwE9PNA7s0IkJ4onnuHqatpsaCiAShJw== + dependencies: + async "^3.2.4" + chalk "^5.2.0" + commander "^10.0.1" + link-check "^5.2.0" + lodash "^4.17.21" + markdown-link-extractor "^3.1.0" + needle "^3.2.0" + progress "^2.0.3" + +markdown-link-extractor@^3.1.0: + version "3.1.0" + resolved "https://registry.yarnpkg.com/markdown-link-extractor/-/markdown-link-extractor-3.1.0.tgz#0d5a703630d791a9e2017449e1a9b294f2d2b676" + integrity sha512-r0NEbP1dsM+IqB62Ru9TXLP/HDaTdBNIeylYXumuBi6Xv4ufjE1/g3TnslYL8VNqNcGAGbMptQFHrrdfoZ/Sug== + dependencies: + html-link-extractor "^1.0.5" + marked "^4.1.0" + markdown-table@^3.0.0: version "3.0.3" resolved "https://registry.yarnpkg.com/markdown-table/-/markdown-table-3.0.3.tgz#e6331d30e493127e031dd385488b5bd326e4a6bd" @@ -6376,6 +6463,11 @@ marked@^2.0.3: resolved "https://registry.yarnpkg.com/marked/-/marked-2.1.3.tgz#bd017cef6431724fd4b27e0657f5ceb14bff3753" integrity sha512-/Q+7MGzaETqifOMWYEA7HVMaZb4XbcRfaOzcSsHZEith83KGlvaSG33u0SKu89Mj5h+T8V2hM+8O45Qc5XTgwA== +marked@^4.1.0: + version "4.3.0" + resolved "https://registry.yarnpkg.com/marked/-/marked-4.3.0.tgz#796362821b019f734054582038b116481b456cf3" + integrity sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A== + markprompt@^0.1.7: version "0.1.7" resolved "https://registry.yarnpkg.com/markprompt/-/markprompt-0.1.7.tgz#fa049e11109d93372c45c38b3ca40bd5fdf751ea" @@ -6978,7 +7070,7 @@ ms@2.1.2: resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009" integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w== -ms@2.1.3: +ms@2.1.3, ms@^2.1.1, ms@^2.1.3: version "2.1.3" resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.3.tgz#574c8138ce1d2b5861f0b44579dbadd60c6615b2" integrity sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA== @@ -7001,6 +7093,15 @@ 
napi-build-utils@^1.0.1: resolved "https://registry.yarnpkg.com/napi-build-utils/-/napi-build-utils-1.0.2.tgz#b1fddc0b2c46e380a0b7a76f984dd47c41a13806" integrity sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg== +needle@^3.1.0, needle@^3.2.0: + version "3.2.0" + resolved "https://registry.yarnpkg.com/needle/-/needle-3.2.0.tgz#07d240ebcabfd65c76c03afae7f6defe6469df44" + integrity sha512-oUvzXnyLiVyVGoianLijF9O/RecZUf7TkBfimjGrLM4eQhXyeJwM6GeAWccwfQ9aa4gMCZKqhAOuLaMIcQxajQ== + dependencies: + debug "^3.2.6" + iconv-lite "^0.6.3" + sax "^1.2.4" + negotiator@0.6.3: version "0.6.3" resolved "https://registry.yarnpkg.com/negotiator/-/negotiator-0.6.3.tgz#58e323a72fedc0d6f9cd4d31fe49f51479590ccd" @@ -7753,6 +7854,11 @@ process-nextick-args@~2.0.0: resolved "https://registry.yarnpkg.com/process-nextick-args/-/process-nextick-args-2.0.1.tgz#7820d9b16120cc55ca9ae7792680ae7dba6d7fe2" integrity sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag== +progress@^2.0.3: + version "2.0.3" + resolved "https://registry.yarnpkg.com/progress/-/progress-2.0.3.tgz#7e8cf8d8f5b8f239c1bc68beb4eb78567d572ef8" + integrity sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA== + promise@^7.1.1: version "7.3.1" resolved "https://registry.yarnpkg.com/promise/-/promise-7.3.1.tgz#064b72602b18f90f29192b8b1bc418ffd1ebd3bf" @@ -7805,16 +7911,16 @@ pump@^3.0.0: end-of-stream "^1.1.0" once "^1.3.1" +punycode@2.x.x, punycode@^2.1.0: + version "2.3.0" + resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.3.0.tgz#f67fa67c94da8f4d0cfff981aee4118064199b8f" + integrity sha512-rRV+zQD8tVFys26lAGR9WUuS4iUAngJScM+ZRSKtvl5tKeZ2t5bvdNFdNHBW9FWR4guGHlgmsZ1G7BSm2wTbuA== + punycode@^1.3.2: version "1.4.1" resolved "https://registry.yarnpkg.com/punycode/-/punycode-1.4.1.tgz#c0d5a63b2718800ad8e1eb0fa5269c84dd41845e" integrity 
sha512-jmYNElW7yvO7TV33CjSmvSiE2yco3bV2czu/OzDKdMNVZQWfxCblURLhf+47syQRBntjfLdd/H0egrzIG+oaFQ== -punycode@^2.1.0: - version "2.3.0" - resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.3.0.tgz#f67fa67c94da8f4d0cfff981aee4118064199b8f" - integrity sha512-rRV+zQD8tVFys26lAGR9WUuS4iUAngJScM+ZRSKtvl5tKeZ2t5bvdNFdNHBW9FWR4guGHlgmsZ1G7BSm2wTbuA== - pupa@^2.1.1: version "2.1.1" resolved "https://registry.yarnpkg.com/pupa/-/pupa-2.1.1.tgz#f5e8fd4afc2c5d97828faa523549ed8744a20d62" @@ -8789,7 +8895,7 @@ safe-buffer@5.2.1, safe-buffer@>=5.1.0, safe-buffer@^5.0.1, safe-buffer@^5.1.0, resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.1.tgz#1eaf9fa9bdb1fdd4ec75f58f9cdb4e6b7827eec6" integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ== -"safer-buffer@>= 2.1.2 < 3": +"safer-buffer@>= 2.1.2 < 3", "safer-buffer@>= 2.1.2 < 3.0.0": version "2.1.2" resolved "https://registry.yarnpkg.com/safer-buffer/-/safer-buffer-2.1.2.tgz#44fa161b0187b9549dd84bb91802f9bd8385cd6a" integrity sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg== diff --git a/docs/advanced/no-code-modeling.md b/docs/advanced/no-code-modeling.md index d76b776d3dddb2..172e63f821eabd 100644 --- a/docs/advanced/no-code-modeling.md +++ b/docs/advanced/no-code-modeling.md @@ -100,10 +100,9 @@ Currently, there are various models in GMS: 1. [Urn](https://github.com/datahub-project/datahub/blob/master/li-utils/src/main/pegasus/com/linkedin/common/DatasetUrn.pdl) - Structs composing primary keys 2. [Root] [Snapshots](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/snapshot/Snapshot.pdl) - Container of aspects 3. [Aspects](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/DashboardAspect.pdl) - Optional container of fields -4. 
[Values](https://github.com/datahub-project/datahub/blob/master/gms/api/src/main/pegasus/com/linkedin/dataset/Dataset.pdl), [Keys](https://github.com/datahub-project/datahub/blob/master/gms/api/src/main/pegasus/com/linkedin/dataset/DatasetKey.pdl) - Model returned by GMS [Rest.li](http://rest.li) API (public facing) -5. [Entities](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DatasetEntity.pdl) - Records with fields derived from the URN. Used only in graph / relationships -6. [Relationships](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Relationship.pdl) - Edges between 2 entities with optional edge properties -7. [Search Documents](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/search/ChartDocument.pdl) - Flat documents for indexing within Elastic index +4. [Keys](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl) - Model returned by GMS [Rest.li](http://rest.li) API (public facing) +5. [Relationships](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/common/EntityRelationship.pdl) - Edges between 2 entities with optional edge properties +6. 
Search Documents - Flat documents for indexing within Elastic index - And corresponding index [mappings.json](https://github.com/datahub-project/datahub/blob/master/gms/impl/src/main/resources/index/chart/mappings.json), [settings.json](https://github.com/datahub-project/datahub/blob/master/gms/impl/src/main/resources/index/chart/settings.json) Various components of GMS depend on / make assumptions about these model types: diff --git a/docs/api/graphql/how-to-set-up-graphql.md b/docs/api/graphql/how-to-set-up-graphql.md index 584bf34ad3f92d..2be2f935b12b10 100644 --- a/docs/api/graphql/how-to-set-up-graphql.md +++ b/docs/api/graphql/how-to-set-up-graphql.md @@ -68,7 +68,7 @@ In the request body, select the `GraphQL` option and enter your GraphQL query in

-Please refer to [Querying with GraphQL](https://learning.postman.com/docs/sending-requests/graphql/graphql/) in the Postman documentation for more information. +Please refer to [Querying with GraphQL](https://learning.postman.com/docs/sending-requests/graphql/graphql-overview/) in the Postman documentation for more information. ### Authentication + Authorization diff --git a/docs/architecture/architecture.md b/docs/architecture/architecture.md index 6a9c1860d71b09..20f18f09d949be 100644 --- a/docs/architecture/architecture.md +++ b/docs/architecture/architecture.md @@ -17,7 +17,7 @@ The figures below describe the high-level architecture of DataHub.

- +

diff --git a/docs/authentication/guides/add-users.md b/docs/authentication/guides/add-users.md index f5dfc832010831..d380cacd6665e4 100644 --- a/docs/authentication/guides/add-users.md +++ b/docs/authentication/guides/add-users.md @@ -19,13 +19,13 @@ To do so, navigate to the **Users & Groups** section inside of Settings page. He do not have the correct privileges to invite users, this button will be disabled.

- +

To invite new users, simply share the link with others inside your organization.

- +

When a new user visits the link, they will be directed to a sign up screen where they can create their DataHub account. @@ -37,13 +37,13 @@ and click **Reset user password** inside the menu dropdown on the right hand sid `Manage User Credentials` [Platform Privilege](../../authorization/access-policies-guide.md) in order to reset passwords.

- +

To reset the password, simply share the password reset link with the user who needs to change their password. Password reset links expire after 24 hours.

- +

# Configuring Single Sign-On with OpenID Connect diff --git a/docs/authentication/guides/sso/configure-oidc-react.md b/docs/authentication/guides/sso/configure-oidc-react.md index d27792ce3967b1..512d6adbf916fc 100644 --- a/docs/authentication/guides/sso/configure-oidc-react.md +++ b/docs/authentication/guides/sso/configure-oidc-react.md @@ -26,7 +26,7 @@ please see [this guide](../jaas.md) to mount a custom user.props file for a JAAS To configure OIDC in React, you will most often need to register yourself as a client with your identity provider (Google, Okta, etc). Each provider may have their own instructions. Provided below are links to examples for Okta, Google, Azure AD, & Keycloak. -- [Registering an App in Okta](https://developer.okta.com/docs/guides/add-an-external-idp/apple/register-app-in-okta/) +- [Registering an App in Okta](https://developer.okta.com/docs/guides/add-an-external-idp/openidconnect/main/) - [OpenID Connect in Google Identity](https://developers.google.com/identity/protocols/oauth2/openid-connect) - [OpenID Connect authentication with Azure Active Directory](https://docs.microsoft.com/en-us/azure/active-directory/fundamentals/auth-oidc) - [Keycloak - Securing Applications and Services Guide](https://www.keycloak.org/docs/latest/securing_apps/) diff --git a/docs/cli.md b/docs/cli.md index eb8bb406b01074..267f289d9f54a6 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -547,7 +547,7 @@ Old Entities Migrated = {'urn:li:dataset:(urn:li:dataPlatform:hive,logging_event ### Using docker [![Docker Hub](https://img.shields.io/docker/pulls/acryldata/datahub-ingestion?style=plastic)](https://hub.docker.com/r/acryldata/datahub-ingestion) -[![datahub-ingestion docker](https://github.com/acryldata/datahub/actions/workflows/docker-ingestion.yml/badge.svg)](https://github.com/acryldata/datahub/actions/workflows/docker-ingestion.yml) +[![datahub-ingestion 
docker](https://github.com/acryldata/datahub/workflows/datahub-ingestion%20docker/badge.svg)](https://github.com/acryldata/datahub/actions/workflows/docker-ingestion.yml) If you don't want to install locally, you can alternatively run metadata ingestion within a Docker container. We have prebuilt images available on [Docker hub](https://hub.docker.com/r/acryldata/datahub-ingestion). All plugins will be installed and enabled automatically. diff --git a/docs/domains.md b/docs/domains.md index c846a753417c59..1b2ebc9d47f397 100644 --- a/docs/domains.md +++ b/docs/domains.md @@ -22,20 +22,20 @@ You can create this privileges by creating a new [Metadata Policy](./authorizati To create a Domain, first navigate to the **Domains** tab in the top-right menu of DataHub.

- +

Once you're on the Domains page, you'll see a list of all the Domains that have been created on DataHub. Additionally, you can view the number of entities inside each Domain.

- +

To create a new Domain, click '+ New Domain'.

- +

Inside the form, you can choose a name for your Domain. Most often, this will align with your business units or groups, for example @@ -48,7 +48,7 @@ for the Domain. This option is useful if you intend to refer to Domains by a com key to be human-readable. Proceed with caution: once you select a custom id, it cannot be easily changed.

- +

By default, you don't need to worry about this. DataHub will auto-generate a unique Domain id for you. @@ -64,7 +64,7 @@ To assign an asset to a Domain, simply navigate to the asset's profile page. At see a 'Domain' section. Click 'Set Domain', and then search for the Domain you'd like to add to. When you're done, click 'Add'.

- +

To remove an asset from a Domain, click the 'x' icon on the Domain tag. @@ -149,27 +149,27 @@ source: Once you've created a Domain, you can use the search bar to find it.

- +

Clicking on the search result will take you to the Domain's profile, where you can edit its description, add / remove owners, and view the assets inside the Domain.

- +

Once you've added assets to a Domain, you can filter search results to limit to those Assets within a particular Domain using the left-side search filters.

- +

On the homepage, you'll also find a list of the most popular Domains in your organization.

- +

## Additional Resources @@ -242,7 +242,6 @@ DataHub supports Tags, Glossary Terms, & Domains as distinct types of Metadata t - **Tags**: Informal, loosely controlled labels that serve as a tool for search & discovery. Assets may have multiple tags. No formal, central management. - **Glossary Terms**: A controlled vocabulary, with optional hierarchy. Terms are typically used to standardize types of leaf-level attributes (i.e. schema fields) for governance. E.g. (EMAIL_PLAINTEXT) - **Domains**: A set of top-level categories. Usually aligned to business units / disciplines to which the assets are most relevant. Central or distributed management. Single Domain assignment per data asset. - *Need more help? Join the conversation in [Slack](http://slack.datahubproject.io)!* ### Related Features diff --git a/docs/how/add-new-aspect.md b/docs/how/add-new-aspect.md index 6ea7256ed75cc0..d1fe567018903b 100644 --- a/docs/how/add-new-aspect.md +++ b/docs/how/add-new-aspect.md @@ -1,20 +1,20 @@ # How to add a new metadata aspect? Adding a new metadata [aspect](../what/aspect.md) is one of the most common ways to extend an existing [entity](../what/entity.md). -We'll use the [CorpUserEditableInfo](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl) as an example here. +We'll use the CorpUserEditableInfo as an example here. 1. Add the aspect model to the corresponding namespace (e.g. [`com.linkedin.identity`](https://github.com/datahub-project/datahub/tree/master/metadata-models/src/main/pegasus/com/linkedin/identity)) -2. Extend the entity's aspect union to include the new aspect (e.g. [`CorpUserAspect`](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/metadata/aspect/CorpUserAspect.pdl)) +2. Extend the entity's aspect union to include the new aspect. 3. 
Rebuild the rest.li [IDL & snapshot](https://linkedin.github.io/rest.li/modeling/compatibility_check) by running the following command from the project root ``` ./gradlew :metadata-service:restli-servlet-impl:build -Prest.model.compatibility=ignore ``` -4. To surface the new aspect at the top-level [resource endpoint](https://linkedin.github.io/rest.li/user_guide/restli_server#writing-resources), extend the resource data model (e.g. [`CorpUser`](https://github.com/datahub-project/datahub/blob/master/gms/api/src/main/pegasus/com/linkedin/identity/CorpUser.pdl)) with an optional field (e.g. [`editableInfo`](https://github.com/datahub-project/datahub/blob/master/gms/api/src/main/pegasus/com/linkedin/identity/CorpUser.pdl#L21)). You'll also need to extend the `toValue` & `toSnapshot` methods of the top-level resource (e.g. [`CorpUsers`](https://github.com/datahub-project/datahub/blob/master/gms/impl/src/main/java/com/linkedin/metadata/resources/identity/CorpUsers.java)) to convert between the snapshot & value models. +4. To surface the new aspect at the top-level [resource endpoint](https://linkedin.github.io/rest.li/user_guide/restli_server#writing-resources), extend the resource data model with an optional field. You'll also need to extend the `toValue` & `toSnapshot` methods of the top-level resource (e.g. [`CorpUsers`](https://github.com/datahub-project/datahub/blob/master/gms/impl/src/main/java/com/linkedin/metadata/resources/identity/CorpUsers.java)) to convert between the snapshot & value models. -5. (Optional) If there's need to update the aspect via API (instead of/in addition to MCE), add a [sub-resource](https://linkedin.github.io/rest.li/user_guide/restli_server#sub-resources) endpoint for the new aspect (e.g. [`CorpUsersEditableInfoResource`](https://github.com/datahub-project/datahub/blob/master/gms/impl/src/main/java/com/linkedin/metadata/resources/identity/CorpUsersEditableInfoResource.java)). 
The sub-resource endpiont also allows you to retrieve previous versions of the aspect as well as additional metadata such as the audit stamp. +5. (Optional) If there's need to update the aspect via API (instead of/in addition to MCE), add a [sub-resource](https://linkedin.github.io/rest.li/user_guide/restli_server#sub-resources) endpoint for the new aspect (e.g. `CorpUsersEditableInfoResource`). The sub-resource endpoint also allows you to retrieve previous versions of the aspect as well as additional metadata such as the audit stamp. -6. After rebuilding & restarting [gms](https://github.com/datahub-project/datahub/tree/master/gms), [mce-consumer-job](https://github.com/datahub-project/datahub/tree/master/metadata-jobs/mce-consumer-job) & [mae-consumer-job](https://github.com/datahub-project/datahub/tree/master/metadata-jobs/mae-consumer-job), +6. After rebuilding & restarting gms, [mce-consumer-job](https://github.com/datahub-project/datahub/tree/master/metadata-jobs/mce-consumer-job) & [mae-consumer-job](https://github.com/datahub-project/datahub/tree/master/metadata-jobs/mae-consumer-job), you should be able to start emitting [MCE](../what/mxe.md) with the new aspect and have it automatically ingested & stored in DB. diff --git a/docs/modeling/extending-the-metadata-model.md b/docs/modeling/extending-the-metadata-model.md index 98f70f6d933e40..be2d7d795de701 100644 --- a/docs/modeling/extending-the-metadata-model.md +++ b/docs/modeling/extending-the-metadata-model.md @@ -24,7 +24,7 @@ We will refer to the two options as the **open-source fork** and **custom reposi ## This Guide This guide will outline what the experience of adding a new Entity should look like through a real example of adding the -Dashboard Entity. If you want to extend an existing Entity, you can skip directly to [Step 3](#step_3). +Dashboard Entity.
If you want to extend an existing Entity, you can skip directly to [Step 3](#step-3-define-custom-aspects-or-attach-existing-aspects-to-your-entity). At a high level, an entity is made up of: @@ -82,14 +82,14 @@ Because they are aspects, keys need to be annotated with an @Aspect annotation, can be a part of. The key can also be annotated with the two index annotations: @Relationship and @Searchable. This instructs DataHub -infra to use the fields in the key to create relationships and index fields for search. See [Step 3](#step_3) for more details on +infra to use the fields in the key to create relationships and index fields for search. See [Step 3](#step-3-define-custom-aspects-or-attach-existing-aspects-to-your-entity) for more details on the annotation model. **Constraints**: Note that each field in a Key Aspect MUST be of String or Enum type. ### Step 2: Create the new entity with its key aspect -Define the entity within an `entity-registry.yml` file. Depending on your approach, the location of this file may vary. More on that in steps [4](#step_4) and [5](#step_5). +Define the entity within an `entity-registry.yml` file. Depending on your approach, the location of this file may vary. More on that in steps [4](#step-4-choose-a-place-to-store-your-model-extension) and [5](#step-5-attaching-your-non-key-aspects-to-the-entity). Example: ```yaml @@ -212,11 +212,11 @@ After you create your Aspect, you need to attach to all the entities that it app **Constraints**: Note that all aspects MUST be of type Record. -### Step 4: Choose a place to store your model extension +### Step 4: Choose a place to store your model extension At the beginning of this document, we walked you through a flow-chart that should help you decide whether you need to maintain a fork of the open source DataHub repo for your model extensions, or whether you can just use a model extension repository that can stay independent of the DataHub repo. 
Depending on what path you took, the place you store your aspect model files (the .pdl files) and the entity-registry files (the yaml file called `entity-registry.yaml` or `entity-registry.yml`) will vary. -- Open source Fork: Aspect files go under [`metadata-models`](../../metadata-models) module in the main repo, entity registry goes into [`metadata-models/src/main/resources/entity-registry.yml`](../../metadata-models/src/main/resources/entity-registry.yml). Read on for more details in [Step 5](#step_5). +- Open source Fork: Aspect files go under [`metadata-models`](../../metadata-models) module in the main repo, entity registry goes into [`metadata-models/src/main/resources/entity-registry.yml`](../../metadata-models/src/main/resources/entity-registry.yml). Read on for more details in [Step 5](#step-5-attaching-your-non-key-aspects-to-the-entity). - Custom repository: Read the [metadata-models-custom](../../metadata-models-custom/README.md) documentation to learn how to store and version your aspect models and registry. ### Step 5: Attaching your non-key Aspect(s) to the Entity diff --git a/docs/modeling/metadata-model.md b/docs/modeling/metadata-model.md index 037c9c7108a6e5..a8958985a0a724 100644 --- a/docs/modeling/metadata-model.md +++ b/docs/modeling/metadata-model.md @@ -433,7 +433,7 @@ aggregation query against a timeseries aspect. The *@TimeseriesField* and the *@TimeseriesFieldCollection* are two new annotations that can be attached to a field of a *Timeseries aspect* that allows it to be part of an aggregatable query. The kinds of aggregations allowed on these annotated fields depends on the type of the field, as well as the kind of aggregation, as -described [here](#Performing-an-aggregation-on-a-Timeseries-aspect). +described [here](#performing-an-aggregation-on-a-timeseries-aspect). 
* `@TimeseriesField = {}` - this annotation can be used with any type of non-collection type field of the aspect such as primitive types and records (see the fields *stat*, *strStat* and *strArray* fields @@ -515,7 +515,7 @@ my_emitter = DatahubRestEmitter("http://localhost:8080") my_emitter.emit(mcpw) ``` -###### Performing an aggregation on a Timeseries aspect. +###### Performing an aggregation on a Timeseries aspect Aggregations on timeseries aspects can be performed by the GMS REST API for `/analytics?action=getTimeseriesStats` which accepts the following params. diff --git a/docs/tags.md b/docs/tags.md index 945b514dc7b473..cb08c9fafea490 100644 --- a/docs/tags.md +++ b/docs/tags.md @@ -27,25 +27,25 @@ You can create these privileges by creating a new [Metadata Policy](./authorizat To add a tag at the dataset or container level, simply navigate to the page for that entity and click on the **Add Tag** button.

- +

Type in the name of the tag you want to add. You can add a new tag, or add a tag that already exists (the autocomplete will pull up the tag if it already exists).

- +

Click on the "Add" button and you'll see the tag has been added!

- +

If you would like to add a tag at the schema level, hover over the "Tags" column for a schema until the "Add Tag" button shows up, and then follow the same flow as above.

- +

### Removing a Tag @@ -57,7 +57,7 @@ To remove a tag, simply click on the "X" button in the tag. Then click "Yes" whe You can search for a tag in the search bar, and even filter entities by the presence of a specific tag.

- +

## Additional Resources diff --git a/docs/townhall-history.md b/docs/townhall-history.md index e235a70c5d7b95..d92905af0cd72c 100644 --- a/docs/townhall-history.md +++ b/docs/townhall-history.md @@ -328,7 +328,7 @@ November Town Hall (in December!) * Welcome - 5 mins * Latest React App Demo! ([video](https://www.youtube.com/watch?v=RQBEJhcen5E)) by John Joyce and Gabe Lyons - 5 mins -* Use-Case: DataHub at Geotab ([slides](https://docs.google.com/presentation/d/1qcgO3BW5NauuG0HnPqrxGcujsK-rJ1-EuU-7cbexkqE/edit?usp=sharing),[video](https://www.youtube.com/watch?v=boyjT2OrlU4)) by [John Yoon](https://www.linkedin.com/in/yhjyoon/) - 15 mins +* Use-Case: DataHub at Geotab ([video](https://www.youtube.com/watch?v=boyjT2OrlU4)) by [John Yoon](https://www.linkedin.com/in/yhjyoon/) - 15 mins * Tech Deep Dive: Tour of new pull-based Python Ingestion scripts ([slides](https://docs.google.com/presentation/d/15Xay596WDIhzkc5c8DEv6M-Bv1N4hP8quup1tkws6ms/edit#slide=id.gb478361595_0_10),[video](https://www.youtube.com/watch?v=u0IUQvG-_xI)) by [Harshal Sheth](https://www.linkedin.com/in/hsheth2/) - 15 mins * General Q&A from sign up sheet, slack, and participants - 15 mins * Closing remarks - 5 mins diff --git a/docs/what/gms.md b/docs/what/gms.md index 9e1cea1b9540e8..a39450d28ae83e 100644 --- a/docs/what/gms.md +++ b/docs/what/gms.md @@ -2,6 +2,4 @@ Metadata for [entities](entity.md) [onboarded](../modeling/metadata-model.md) to [GMA](gma.md) is served through microservices known as Generalized Metadata Service (GMS). GMS typically provides a [Rest.li](http://rest.li) API and must access the metadata using [GMA DAOs](../architecture/metadata-serving.md). -While a GMS is completely free to define its public APIs, we do provide a list of [resource base classes](https://github.com/datahub-project/datahub-gma/tree/master/restli-resources/src/main/java/com/linkedin/metadata/restli) to leverage for common patterns. 
- -GMA is designed to support a distributed fleet of GMS, each serving a subset of the [GMA graph](graph.md). However, for simplicity we include a single centralized GMS ([datahub-gms](../../gms)) that serves all entities. +GMA is designed to support a distributed fleet of GMS, each serving a subset of the [GMA graph](graph.md). However, for simplicity we include a single centralized GMS that serves all entities. diff --git a/docs/what/mxe.md b/docs/what/mxe.md index 8af96360858a33..25294e04ea3d92 100644 --- a/docs/what/mxe.md +++ b/docs/what/mxe.md @@ -266,7 +266,7 @@ A Metadata Change Event represents a request to change multiple aspects for the It leverages a deprecated concept of `Snapshot`, which is a strongly-typed list of aspects for the same entity. -A MCE is a "proposal" for a set of metadata changes, as opposed to [MAE](#metadata-audit-event), which is conveying a committed change. +A MCE is a "proposal" for a set of metadata changes, as opposed to [MAE](#metadata-audit-event-mae), which is conveying a committed change. Consequently, only successfully accepted and processed MCEs will lead to the emission of a corresponding MAE / MCLs. ### Emission diff --git a/docs/what/relationship.md b/docs/what/relationship.md index dcfe093a1b1245..d5348dc04b3c01 100644 --- a/docs/what/relationship.md +++ b/docs/what/relationship.md @@ -102,9 +102,6 @@ For one, the actual direction doesn’t really impact the execution of graph que That being said, generally there’s a more "natural way" to specify the direction of a relationship, which closely relate to how the metadata is stored. For example, the membership information for an LDAP group is generally stored as a list in group’s metadata. As a result, it’s more natural to model a `HasMember` relationship that points from a group to a member, instead of a `IsMemberOf` relationship pointing from member to group. 
-Since all relationships are explicitly declared, it’s fairly easy for a user to discover what relationships are available and their directionality by inspecting -the [relationships directory](../../metadata-models/src/main/pegasus/com/linkedin/metadata/relationship). It’s also possible to provide a UI for the catalog of entities and relationships for analysts who are interested in building complex graph queries to gain insights into the metadata. - ## High Cardinality Relationships See [this doc](../advanced/high-cardinality.md) for suggestions on how to best model relationships with high cardinality. diff --git a/docs/what/search-document.md b/docs/what/search-document.md index 81359a55d0caec..bd27656e512c3a 100644 --- a/docs/what/search-document.md +++ b/docs/what/search-document.md @@ -13,7 +13,6 @@ As a result, one may be tempted to add as many attributes as needed. This is acc Below shows an example schema for the `User` search document. Note that: 1. Each search document is required to have a type-specific `urn` field, generally maps to an entity in the [graph](graph.md). 2. Similar to `Entity`, each document has an optional `removed` field for "soft deletion". -This is captured in [BaseDocument](../../metadata-models/src/main/pegasus/com/linkedin/metadata/search/BaseDocument.pdl), which is expected to be included by all documents. 3. Similar to `Entity`, all remaining fields are made `optional` to support partial updates. 4. `management` shows an example of a string array field. 5. `ownedDataset` shows an example on how a field can be derived from metadata [aspects](aspect.md) associated with other types of entity (in this case, `Dataset`). 
diff --git a/metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source.md b/metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source.md index 6a1204fb0f2b35..9e39d24fb85782 100644 --- a/metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source.md +++ b/metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source.md @@ -60,16 +60,14 @@ class StaleEntityCheckpointStateBase(CheckpointStateBase, ABC, Generic[Derived]) ``` Examples: -1. [KafkaCheckpointState](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/state/kafka_state.py#L11). -2. [DbtCheckpointState](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/state/dbt_state.py#L16) -3. [BaseSQLAlchemyCheckpointState](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/state/sql_common_state.py#L17) +* [BaseSQLAlchemyCheckpointState](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/state/sql_common_state.py#L17) ### 2. Modifying the SourceConfig The source's config must inherit from `StatefulIngestionConfigBase`, and should declare a field named `stateful_ingestion` of type `Optional[StatefulStaleMetadataRemovalConfig]`. Examples: -1. The `KafkaSourceConfig` +- The `KafkaSourceConfig` ```python from typing import List, Optional import pydantic @@ -84,9 +82,6 @@ class KafkaSourceConfig(StatefulIngestionConfigBase): stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None ``` -2. The [DBTStatefulIngestionConfig](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/dbt.py#L131) - and the [DBTConfig](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/source/dbt.py#L317). - ### 3. 
Modifying the SourceReport The report class of the source should inherit from `StaleEntityRemovalSourceReport` whose definition is shown below. ```python @@ -102,7 +97,7 @@ class StaleEntityRemovalSourceReport(StatefulIngestionReport): ``` Examples: -1. The `KafkaSourceReport` +* The `KafkaSourceReport` ```python from dataclasses import dataclass from datahub.ingestion.source.state.stale_entity_removal_handler import StaleEntityRemovalSourceReport @@ -110,7 +105,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import StaleEnt class KafkaSourceReport(StaleEntityRemovalSourceReport): # Date: Thu, 14 Sep 2023 11:40:38 +0530 Subject: [PATCH 18/65] docs(managed datahub): release notes 0.2.11 (#8830) --- docs-website/sidebars.js | 1 + .../managed-datahub/release-notes/v_0_2_11.md | 73 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 docs/managed-datahub/release-notes/v_0_2_11.md diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index fcf82b786a1b95..12691e9f8268a5 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -597,6 +597,7 @@ module.exports = { }, { "Managed DataHub Release History": [ + "docs/managed-datahub/release-notes/v_0_2_11", "docs/managed-datahub/release-notes/v_0_2_10", "docs/managed-datahub/release-notes/v_0_2_9", "docs/managed-datahub/release-notes/v_0_2_8", diff --git a/docs/managed-datahub/release-notes/v_0_2_11.md b/docs/managed-datahub/release-notes/v_0_2_11.md new file mode 100644 index 00000000000000..1f420908487127 --- /dev/null +++ b/docs/managed-datahub/release-notes/v_0_2_11.md @@ -0,0 +1,73 @@ +# v0.2.11 +--- + +Release Availability Date +--- +14-Sep-2023 + +Recommended CLI/SDK +--- +- `v0.11.0` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.10.5.5 +- [Deprecation] In LDAP ingestor, the manager_pagination_enabled changed to general pagination_enabled + +If you are using an older CLI/SDK version then please upgrade it. 
This applies for all CLI/SDK usages, if you are using it through your terminal, github actions, airflow, in python SDK somewhere, Java SDK etc. This is a strong recommendation to upgrade as we keep on pushing fixes in the CLI and it helps us support you better. + +Special Notes +--- +- Deployment process for this release is going to have a downtime when system will be in a read only mode. A rough estimate is 1 hour for every 2.3 million entities (includes soft-deleted entities). + + +## Release Changelog +--- +- Since `v0.2.10` these changes from OSS DataHub https://github.com/datahub-project/datahub/compare/2b0952195b7895df0a2bf92b28e71aac18217781...75252a3d9f6a576904be5a0790d644b9ae2df6ac have been pulled in. +- Misc fixes & features + - Proposals + - Group names shown correctly for proposal Inbox + - Metadata tests + - Deprecate/Un-deprecate actions available in Metadata tests + - Last Observed (in underlying sql) available as a filter in metadata tests + - [Breaking change] Renamed `__lastUpdated` -> `__created` as a filter to correctly represent what it was. This was not surfaced in the UI. But if you were using it then this needs to be renamed. Acryl Customer Success team will keep an eye out to pro-actively find and bring this up if you are affected by this. + - Robustness improvements to metadata test runs + - Copy urn for metadata tests to allow for easier filtering for iteration over metadata test results via our APIs. + - A lot more fixes to subscriptions, notifications and Observability (Beta). + - Some performance improvements to lineage queries + +## Some notable features in this SaaS release +- We now enable you to create and delete pinned announcements on your DataHub homepage! If you have the “Manage Home Page Posts” platform privilege you’ll see a new section in settings called “Home Page Posts” where you can create and delete text posts and link posts that your users see on the home page. +- Improvements to search experience +
+