From 01a2c0c77944759c779ae06dc44198f956ab2da9 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Wed, 18 Dec 2024 19:02:44 +0530 Subject: [PATCH 1/8] fix(ingest/kafka): update dependency, tests (#12159) --- metadata-ingestion/setup.py | 2 +- metadata-ingestion/tests/integration/kafka/test_kafka.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 31db711592eb1..6334b3abbb8a0 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -76,7 +76,7 @@ # now provide prebuilt wheels for most platforms, including M1 Macs and # Linux aarch64 (e.g. Docker's linux/arm64). Installing confluent_kafka # from source remains a pain. - "confluent_kafka>=1.9.0", + "confluent_kafka[schemaregistry]>=1.9.0", # We currently require both Avro libraries. The codegen uses avro-python3 (above) # schema parsers at runtime for generating and reading JSON into Python objects. # At the same time, we use Kafka's AvroSerializer, which internally relies on diff --git a/metadata-ingestion/tests/integration/kafka/test_kafka.py b/metadata-ingestion/tests/integration/kafka/test_kafka.py index 0d9a714625e96..648c4b26b20a7 100644 --- a/metadata-ingestion/tests/integration/kafka/test_kafka.py +++ b/metadata-ingestion/tests/integration/kafka/test_kafka.py @@ -102,7 +102,7 @@ def test_kafka_test_connection(mock_kafka_service, config_dict, is_success): test_connection_helpers.assert_capability_report( capability_report=report.capability_report, failure_capabilities={ - SourceCapability.SCHEMA_METADATA: "Failed to establish a new connection" + SourceCapability.SCHEMA_METADATA: "[Errno 111] Connection refused" }, ) From 8c724dbf47dd76a4aefec0a93267e08ddeda7e58 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Wed, 18 Dec 2024 12:45:38 -0600 Subject: [PATCH 2/8] feat(api): authorization extended for soft-delete and suspend (#12158) --- datahub-frontend/app/auth/AuthModule.java | 2 + .../upgrade/config/SystemUpdateConfig.java | 2 + .../restorebackup/RestoreStorageStep.java | 2 +- .../upgrade/system/AbstractMCLStep.java | 3 +- .../bootstrapmcps/BootstrapMCPUtil.java | 4 +- ...ateSchemaFieldsFromSchemaMetadataStep.java | 10 +- ...chemaFieldsFromSchemaMetadataStepTest.java | 3 +- .../aspect/CachingAspectRetriever.java | 36 +++- .../metadata/aspect/GraphRetriever.java | 23 +++ .../metadata/entity/SearchRetriever.java | 19 ++ .../metadata/aspect/MockAspectRetriever.java | 4 +- .../java/com/linkedin/metadata/Constants.java | 2 + .../ebean/batch/AspectsBatchImplTest.java | 8 +- .../aspect/utils/DefaultAspectsUtil.java | 2 +- .../client/EntityClientAspectRetriever.java | 7 +- .../metadata/client/JavaEntityClient.java | 21 ++- .../client/SystemJavaEntityClient.java | 2 +- .../entity/EntityServiceAspectRetriever.java | 10 +- .../metadata/entity/EntityServiceImpl.java | 67 +++---- .../linkedin/metadata/entity/EntityUtils.java | 2 +- .../cassandra/CassandraRetentionService.java | 2 +- .../entity/ebean/EbeanRetentionService.java | 2 +- .../query/filter/BaseQueryFilterRewriter.java | 2 +- .../SearchDocumentTransformer.java | 2 - .../BusinessAttributeUpdateHookService.java | 4 +- .../service/UpdateGraphIndicesService.java | 3 +- .../service/UpdateIndicesService.java | 5 +- .../metadata/AspectIngestionUtils.java | 12 +- .../hooks/IgnoreUnknownMutatorTest.java | 12 +- .../aspect/utils/DefaultAspectsUtilTest.java | 3 +- 
.../DataProductUnsetSideEffectTest.java | 8 +- .../entity/EbeanEntityServiceTest.java | 36 ++-- .../metadata/entity/EntityServiceTest.java | 118 ++++++------ .../cassandra/CassandraEntityServiceTest.java | 11 +- .../ebean/batch/ChangeItemImplTest.java | 4 +- .../RecommendationsServiceTest.java | 3 +- .../SchemaFieldSideEffectTest.java | 12 +- .../ContainerExpansionRewriterTest.java | 5 +- .../filter/DomainExpansionRewriterTest.java | 9 +- .../request/AggregationQueryBuilderTest.java | 9 +- .../request/SearchRequestHandlerTest.java | 1 + .../SearchDocumentTransformerTest.java | 12 ++ ...ropertyDefinitionDeleteSideEffectTest.java | 12 +- .../ShowPropertyAsBadgeValidatorTest.java | 2 +- .../io/datahubproject/test/DataGenerator.java | 5 +- .../MCLSpringCommonTestConfiguration.java | 3 +- .../hook/BusinessAttributeUpdateHookTest.java | 16 +- .../metadata/context/ActorContext.java | 48 +++++ .../metadata/context/OperationContext.java | 123 ++++++++----- .../metadata/context/RetrieverContext.java | 29 +++ .../exception/ActorAccessException.java | 7 + .../exception/OperationContextException.java | 9 + .../context/TestOperationContexts.java | 139 ++++++-------- .../context/OperationContextTest.java | 3 +- .../token/StatefulTokenService.java | 2 +- .../src/main/resources/application.yaml | 6 +- .../SystemOperationContextFactory.java | 14 +- .../IngestDataPlatformInstancesStep.java | 4 +- .../boot/steps/IngestPoliciesStep.java | 2 +- .../GlobalControllerExceptionHandler.java | 14 +- .../controller/GenericEntitiesController.java | 8 +- .../openapi/operations/test/IdController.java | 54 ++++++ .../openapi/util/MappingUtil.java | 2 +- .../v2/controller/EntityController.java | 4 +- .../v3/controller/EntityController.java | 4 +- ...m.linkedin.entity.entitiesV2.restspec.json | 8 + ...m.linkedin.entity.entitiesV2.snapshot.json | 8 + .../linkedin/entity/client/EntityClient.java | 71 ++++++- .../entity/client/RestliEntityClient.java | 13 +- .../client/SystemRestliEntityClient.java | 2 +- .../resources/entity/AspectResource.java | 2 +- .../resources/entity/EntityV2Resource.java | 10 +- .../resources/restli/RestliConstants.java | 3 + .../resources/restli/RestliUtils.java | 8 + .../resources/entity/AspectResourceTest.java | 2 +- .../tokens/revokable_access_token_test.py | 44 +---- .../tests/tokens/session_access_token_test.py | 173 ++++++++++++++++++ smoke-test/tests/tokens/token_utils.py | 53 ++++++ 78 files changed, 980 insertions(+), 431 deletions(-) create mode 100644 metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java create mode 100644 metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java rename metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/{ => config}/GlobalControllerExceptionHandler.java (81%) create mode 100644 metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java create mode 100644 smoke-test/tests/tokens/session_access_token_test.py create mode 100644 smoke-test/tests/tokens/token_utils.py diff --git a/datahub-frontend/app/auth/AuthModule.java b/datahub-frontend/app/auth/AuthModule.java index 7fa99ab3cb262..b95515684f01f 100644 --- a/datahub-frontend/app/auth/AuthModule.java +++ b/datahub-frontend/app/auth/AuthModule.java @@ -27,6 +27,7 @@ import io.datahubproject.metadata.context.EntityRegistryContext; import io.datahubproject.metadata.context.OperationContext; import 
io.datahubproject.metadata.context.OperationContextConfig; +import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.SearchContext; import io.datahubproject.metadata.context.ValidationContext; import java.nio.charset.StandardCharsets; @@ -195,6 +196,7 @@ protected OperationContext provideOperationContext( .searchContext(SearchContext.EMPTY) .entityRegistryContext(EntityRegistryContext.builder().build(EmptyEntityRegistry.EMPTY)) .validationContext(ValidationContext.builder().alternateValidation(false).build()) + .retrieverContext(RetrieverContext.EMPTY) .build(systemAuthentication); } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java index 661717c6309cf..fdd84da6044f7 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java @@ -13,6 +13,7 @@ import com.linkedin.gms.factory.kafka.common.TopicConventionFactory; import com.linkedin.gms.factory.kafka.schemaregistry.InternalSchemaRegistryFactory; import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.config.kafka.KafkaConfiguration; import com.linkedin.metadata.dao.producer.KafkaEventProducer; import com.linkedin.metadata.dao.producer.KafkaHealthChecker; @@ -186,6 +187,7 @@ protected OperationContext javaSystemOperationContext( components.getIndexConvention(), RetrieverContext.builder() .aspectRetriever(entityServiceAspectRetriever) + .cachingAspectRetriever(CachingAspectRetriever.EMPTY) .graphRetriever(systemGraphRetriever) .searchRetriever(searchServiceSearchRetriever) .build(), diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java index 4d53b603c1eaf..1e5cd6cdb2417 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java @@ -180,7 +180,7 @@ private void readerExecutable(ReaderWrapper reader, UpgradeContext context) { try { aspectRecord = EntityUtils.toSystemAspect( - context.opContext().getRetrieverContext().get(), aspect.toEntityAspect()) + context.opContext().getRetrieverContext(), aspect.toEntityAspect()) .get() .getRecordTemplate(); } catch (Exception e) { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java index cd7947ce3c11a..56feffd211bcd 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java @@ -113,8 +113,7 @@ public Function executable() { List, SystemAspect>> futures; futures = EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), - batch.collect(Collectors.toList())) + opContext.getRetrieverContext(), batch.collect(Collectors.toList())) .stream() .map( systemAspect -> { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java 
b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java index 4cc3edff3eb52..5b807c6c450af 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java @@ -100,8 +100,8 @@ static AspectsBatch generateAspectBatch( .collect(Collectors.toList()); return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) - .retrieverContext(opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java index 55bc8edbf6a76..de03538907432 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java @@ -168,13 +168,13 @@ public Function executable() { AspectsBatch aspectsBatch = AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( batch .flatMap( ebeanAspectV2 -> EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), Set.of(ebeanAspectV2)) .stream()) .map( @@ -189,11 +189,7 @@ public Function executable() { .auditStamp(systemAspect.getAuditStamp()) .systemMetadata( withAppSource(systemAspect.getSystemMetadata())) - .build( - opContext - .getRetrieverContext() - .get() - .getAspectRetriever())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList())) .build(); diff --git a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java index 3a2728b4e1d3d..04b1095e770e0 100644 --- a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java +++ b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java @@ -22,7 +22,6 @@ import com.linkedin.upgrade.DataHubUpgradeState; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RetrieverContext; -import java.util.Optional; import java.util.stream.Stream; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -48,7 +47,7 @@ public void setup() { step = new GenerateSchemaFieldsFromSchemaMetadataStep( mockOpContext, mockEntityService, mockAspectDao, 10, 100, 1000); - when(mockOpContext.getRetrieverContext()).thenReturn(Optional.of(mockRetrieverContext)); + when(mockOpContext.getRetrieverContext()).thenReturn(mockRetrieverContext); } /** Test to verify the correct step ID is returned. 
*/ diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java index 77e799f752455..375dd8cf8911e 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java @@ -1,4 +1,38 @@ package com.linkedin.metadata.aspect; +import com.linkedin.common.urn.Urn; +import com.linkedin.entity.Aspect; +import com.linkedin.metadata.models.registry.EmptyEntityRegistry; +import com.linkedin.metadata.models.registry.EntityRegistry; +import java.util.Collections; +import java.util.Map; +import java.util.Set; +import javax.annotation.Nonnull; + /** Responses can be cached based on application.yaml caching configuration for the EntityClient */ -public interface CachingAspectRetriever extends AspectRetriever {} +public interface CachingAspectRetriever extends AspectRetriever { + + CachingAspectRetriever EMPTY = new EmptyAspectRetriever(); + + class EmptyAspectRetriever implements CachingAspectRetriever { + @Nonnull + @Override + public Map> getLatestAspectObjects( + Set urns, Set aspectNames) { + return Collections.emptyMap(); + } + + @Nonnull + @Override + public Map> getLatestSystemAspects( + Map> urnAspectNames) { + return Collections.emptyMap(); + } + + @Nonnull + @Override + public EntityRegistry getEntityRegistry() { + return EmptyEntityRegistry.EMPTY; + } + } +} diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java index f6858e7da4ba6..30a2c1eb9df8c 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java @@ -4,6 +4,7 @@ import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.RelationshipFilter; import com.linkedin.metadata.query.filter.SortCriterion; +import java.util.Collections; import java.util.List; import java.util.function.Function; import javax.annotation.Nonnull; @@ -97,4 +98,26 @@ default void consumeRelatedEntities( } } } + + GraphRetriever EMPTY = new EmptyGraphRetriever(); + + class EmptyGraphRetriever implements GraphRetriever { + + @Nonnull + @Override + public RelatedEntitiesScrollResult scrollRelatedEntities( + @Nullable List sourceTypes, + @Nonnull Filter sourceEntityFilter, + @Nullable List destinationTypes, + @Nonnull Filter destinationEntityFilter, + @Nonnull List relationshipTypes, + @Nonnull RelationshipFilter relationshipFilter, + @Nonnull List sortCriterion, + @Nullable String scrollId, + int count, + @Nullable Long startTimeMillis, + @Nullable Long endTimeMillis) { + return new RelatedEntitiesScrollResult(0, 0, null, Collections.emptyList()); + } + } } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java index eaa106b8d1f63..d4894c97015f8 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java @@ -2,6 +2,7 @@ import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.search.ScrollResult; +import com.linkedin.metadata.search.SearchEntityArray; import java.util.List; import 
javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -21,4 +22,22 @@ ScrollResult scroll( @Nullable Filter filters, @Nullable String scrollId, int count); + + SearchRetriever EMPTY = new EmptySearchRetriever(); + + class EmptySearchRetriever implements SearchRetriever { + + @Override + public ScrollResult scroll( + @Nonnull List entities, + @Nullable Filter filters, + @Nullable String scrollId, + int count) { + ScrollResult empty = new ScrollResult(); + empty.setEntities(new SearchEntityArray()); + empty.setNumEntities(0); + empty.setPageSize(0); + return empty; + } + } } diff --git a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java index 65705f15022b6..98a6d59004a92 100644 --- a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java +++ b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java @@ -5,7 +5,7 @@ import com.linkedin.data.DataMap; import com.linkedin.data.template.RecordTemplate; import com.linkedin.entity.Aspect; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.mxe.SystemMetadata; @@ -22,7 +22,7 @@ import javax.annotation.Nonnull; import org.mockito.Mockito; -public class MockAspectRetriever implements AspectRetriever { +public class MockAspectRetriever implements CachingAspectRetriever { private final Map> data; private final Map> systemData = new HashMap<>(); diff --git a/li-utils/src/main/java/com/linkedin/metadata/Constants.java b/li-utils/src/main/java/com/linkedin/metadata/Constants.java index ff6a79108600a..09f873ebf7bc9 100644 --- a/li-utils/src/main/java/com/linkedin/metadata/Constants.java +++ b/li-utils/src/main/java/com/linkedin/metadata/Constants.java @@ -409,6 +409,8 @@ public class Constants { /** User Status */ public static final String CORP_USER_STATUS_ACTIVE = "ACTIVE"; + public static final String CORP_USER_STATUS_SUSPENDED = "SUSPENDED"; + /** Task Runs */ public static final String DATA_PROCESS_INSTANCE_ENTITY_NAME = "dataProcessInstance"; diff --git a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java index 9f57d36f800de..a3099b9ee21ea 100644 --- a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java +++ b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java @@ -16,7 +16,7 @@ import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.dataset.DatasetProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.patch.GenericJsonPatch; @@ -56,7 +56,7 @@ public class AspectsBatchImplTest { private EntityRegistry testRegistry; - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext 
retrieverContext; @BeforeTest @@ -75,12 +75,12 @@ public void beforeTest() throws EntityRegistryException { @BeforeMethod public void setup() { - this.mockAspectRetriever = mock(AspectRetriever.class); + this.mockAspectRetriever = mock(CachingAspectRetriever.class); when(this.mockAspectRetriever.getEntityRegistry()).thenReturn(testRegistry); this.retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .graphRetriever(mock(GraphRetriever.class)) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java index 99eadd223acd1..82bc0ae1409c5 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java @@ -137,7 +137,7 @@ public static List getAdditionalChanges( getProposalFromAspectForDefault( entry.getKey(), entry.getValue(), entityKeyAspect, templateItem), templateItem.getAuditStamp(), - opContext.getAspectRetrieverOpt().get())) + opContext.getAspectRetriever())) .filter(Objects::nonNull); }) .collect(Collectors.toList()); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java b/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java index bba8324d0c561..669ec751f87c6 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java @@ -35,7 +35,7 @@ public EntityRegistry getEntityRegistry() { @Override public Aspect getLatestAspectObject(@Nonnull Urn urn, @Nonnull String aspectName) { try { - return entityClient.getLatestAspectObject(systemOperationContext, urn, aspectName); + return entityClient.getLatestAspectObject(systemOperationContext, urn, aspectName, false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } @@ -49,7 +49,7 @@ public Map> getLatestAspectObjects( return Map.of(); } else { try { - return entityClient.getLatestAspects(systemOperationContext, urns, aspectNames); + return entityClient.getLatestAspects(systemOperationContext, urns, aspectNames, false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } @@ -70,7 +70,8 @@ public Map> getLatestSystemAspects( urnAspectNames.keySet(), urnAspectNames.values().stream() .flatMap(Collection::stream) - .collect(Collectors.toSet())); + .collect(Collectors.toSet()), + false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java index 29faa3955ea66..3d35f5956b0f4 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java @@ -106,11 +106,17 @@ public EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final Set projectedAspects 
= aspectNames == null ? opContext.getEntityAspectNames(entityName) : aspectNames; - return entityService.getEntityV2(opContext, entityName, urn, projectedAspects); + return entityService.getEntityV2( + opContext, + entityName, + urn, + projectedAspects, + alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } @Override @@ -126,7 +132,8 @@ public Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull Set urns, - @Nullable Set aspectNames) + @Nullable Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final Set projectedAspects = aspectNames == null ? opContext.getEntityAspectNames(entityName) : aspectNames; @@ -139,7 +146,11 @@ public Map batchGetV2( try { responseMap.putAll( entityService.getEntitiesV2( - opContext, entityName, new HashSet<>(batch), projectedAspects)); + opContext, + entityName, + new HashSet<>(batch), + projectedAspects, + alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect)); } catch (URISyntaxException e) { throw new RuntimeException(e); } @@ -772,7 +783,7 @@ public List batchIngestProposals( .mcps( batch, auditStamp, - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), opContext.getValidationContext().isAlternateValidation()) .build(); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java index eda9b3a880228..1d2fd422d7f46 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java @@ -89,6 +89,6 @@ public Map batchGetV2NoCache( @Nonnull Set urns, @Nullable Set aspectNames) throws RemoteInvocationException, URISyntaxException { - return super.batchGetV2(opContext, entityName, urns, aspectNames); + return super.batchGetV2(opContext, entityName, urns, aspectNames, false); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java index 626a1f72f5fb7..50cf8af30d606 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java @@ -5,7 +5,7 @@ import com.linkedin.common.urn.Urn; import com.linkedin.entity.Aspect; -import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.AspectRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.models.registry.EntityRegistry; import io.datahubproject.metadata.context.OperationContext; @@ -22,7 +22,7 @@ @Getter @Builder -public class EntityServiceAspectRetriever implements CachingAspectRetriever { +public class EntityServiceAspectRetriever implements AspectRetriever { @Setter private OperationContext systemOperationContext; private final EntityRegistry entityRegistry; @@ -46,7 +46,8 @@ public Map> getLatestAspectObjects( String entityName = urns.stream().findFirst().map(Urn::getEntityType).get(); try { return entityResponseToAspectMap( - entityService.getEntitiesV2(systemOperationContext, entityName, urns, aspectNames)); + entityService.getEntitiesV2( + systemOperationContext, entityName, urns, aspectNames, false)); } catch (URISyntaxException e) { throw new RuntimeException(e); } @@ -71,7 +72,8 @@ public Map> 
getLatestSystemAspects( urnAspectNames.keySet(), urnAspectNames.values().stream() .flatMap(Collection::stream) - .collect(Collectors.toSet())), + .collect(Collectors.toSet()), + false), entityRegistry); } catch (URISyntaxException e) { throw new RuntimeException(e); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java index 6de7784bfbc0e..8ae09111204ca 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java @@ -261,8 +261,7 @@ public Map> getLatestAspects( } List systemAspects = - EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), batchGetResults.values()); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), batchGetResults.values()); systemAspects.stream() // for now, don't add the key aspect here we have already added it above @@ -290,8 +289,7 @@ public Map getLatestAspectsForUrn( Map batchGetResults = getLatestAspect(opContext, new HashSet<>(Arrays.asList(urn)), aspectNames, forUpdate); - return EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), batchGetResults.values()) + return EntityUtils.toSystemAspects(opContext.getRetrieverContext(), batchGetResults.values()) .stream() .map( systemAspect -> Pair.of(systemAspect.getAspectName(), systemAspect.getRecordTemplate())) @@ -335,7 +333,7 @@ public Pair getAspectVersionPair( final Optional maybeAspect = Optional.ofNullable(aspectDao.getAspect(primaryKey)); return Pair.of( - EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), maybeAspect.orElse(null)) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), maybeAspect.orElse(null)) .map(SystemAspect::getRecordTemplate) .orElse(null), version); @@ -721,7 +719,7 @@ public ListResult listLatestAspects( } return new ListResult<>( - EntityUtils.toSystemAspects(opContext.getRetrieverContext().get(), entityAspects).stream() + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), entityAspects).stream() .map(SystemAspect::getRecordTemplate) .collect(Collectors.toList()), aspectMetadataList.getMetadata(), @@ -758,12 +756,12 @@ public List ingestAspects( .recordTemplate(pair.getValue()) .systemMetadata(systemMetadata) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()); return ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -815,13 +813,13 @@ private void processPostCommitMCLSideEffects( log.debug("Considering {} MCLs post commit side effects.", mcls.size()); List batch = mcls.stream() - .map(mcl -> MCLItemImpl.builder().build(mcl, opContext.getAspectRetrieverOpt().get())) + .map(mcl -> MCLItemImpl.builder().build(mcl, opContext.getAspectRetriever())) .collect(Collectors.toList()); Iterable> iterable = () -> Iterators.partition( - AspectsBatch.applyPostMCPSideEffects(batch, opContext.getRetrieverContext().get()) + AspectsBatch.applyPostMCPSideEffects(batch, opContext.getRetrieverContext()) .iterator(), MCP_SIDE_EFFECT_KAFKA_BATCH_SIZE); StreamSupport.stream(iterable.spliterator(), false) @@ -831,7 +829,7 @@ private void processPostCommitMCLSideEffects( ingestProposalAsync( AspectsBatchImpl.builder() .items(sideEffects) - 
.retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build()) .count(); log.info("Generated {} MCP SideEffects for async processing", count); @@ -879,8 +877,7 @@ private List ingestAspectsToLocalDB( aspectDao.getLatestAspects(urnAspects, true); final Map> batchAspects = - EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), databaseAspects); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), databaseAspects); // read #2 (potentially) final Map> nextVersions = @@ -903,7 +900,7 @@ private List ingestAspectsToLocalDB( Map> newLatestAspects = EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getLatestAspects(updatedItems.getFirst(), true)); // merge updatedLatestAspects = AspectsBatch.merge(batchAspects, newLatestAspects); @@ -941,7 +938,7 @@ private List ingestAspectsToLocalDB( // do final pre-commit checks with previous aspect value ValidationExceptionCollection exceptions = - AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext().get()); + AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext()); if (exceptions.hasFatalExceptions()) { // IF this is a client request/API request we fail the `transaction batch` @@ -1143,8 +1140,8 @@ public RecordTemplate ingestAspectIfNotPresent( .recordTemplate(newValue) .systemMetadata(systemMetadata) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get()), - opContext.getRetrieverContext().get()) + .build(opContext.getAspectRetriever()), + opContext.getRetrieverContext()) .build(); List ingested = ingestAspects(opContext, aspectsBatch, true, false); @@ -1169,7 +1166,7 @@ public IngestResult ingestProposal( return ingestProposal( opContext, AspectsBatchImpl.builder() - .mcps(List.of(proposal), auditStamp, opContext.getRetrieverContext().get()) + .mcps(List.of(proposal), auditStamp, opContext.getRetrieverContext()) .build(), async) .stream() @@ -1246,7 +1243,7 @@ private Stream ingestTimeseriesProposal( .recordTemplate( EntityApiUtils.buildKeyAspect( opContext.getEntityRegistry(), item.getUrn())) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()); ingestProposalSync( @@ -1469,7 +1466,7 @@ public List restoreIndices( List systemAspects = EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), batch.collect(Collectors.toList())); + opContext.getRetrieverContext(), batch.collect(Collectors.toList())); RestoreIndicesResult result = restoreIndices(opContext, systemAspects, logger); result.timeSqlQueryMs = timeSqlQueryMs; @@ -1513,7 +1510,7 @@ public List restoreIndices( long startTime = System.currentTimeMillis(); List systemAspects = EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), getLatestAspect(opContext, entityBatch.getValue(), aspectNames, false).values()); long timeSqlQueryMs = System.currentTimeMillis() - startTime; @@ -1649,12 +1646,12 @@ private RestoreIndicesResult restoreIndices( .auditStamp(auditStamp) .systemMetadata(latestSystemMetadata) .recordTemplate(EntityApiUtils.buildKeyAspect(opContext.getEntityRegistry(), urn)) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); Stream defaultAspectsResult = ingestProposalSync( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + 
.retrieverContext(opContext.getRetrieverContext()) .items(keyAspect) .build()); defaultAspectsCreated += defaultAspectsResult.count(); @@ -1966,7 +1963,7 @@ private void ingestSnapshotUnion( AspectsBatchImpl aspectsBatch = AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( aspectRecordsToIngest.stream() .map( @@ -1977,7 +1974,7 @@ private void ingestSnapshotUnion( .recordTemplate(pair.getValue()) .auditStamp(auditStamp) .systemMetadata(systemMetadata) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList())) .build(); @@ -2128,7 +2125,7 @@ public RollbackRunResult deleteUrn(@Nonnull OperationContext opContext, Urn urn) } SystemMetadata latestKeySystemMetadata = - EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), latestKey) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), latestKey) .map(SystemAspect::getSystemMetadata) .get(); RollbackResult result = @@ -2253,11 +2250,11 @@ private RollbackResult deleteAspectWithoutMCL( .urn(entityUrn) .aspectName(aspectName) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get()); + .build(opContext.getAspectRetriever()); // Delete validation hooks ValidationExceptionCollection exceptions = - AspectsBatch.validateProposed(List.of(deleteItem), opContext.getRetrieverContext().get()); + AspectsBatch.validateProposed(List.of(deleteItem), opContext.getRetrieverContext()); if (!exceptions.isEmpty()) { throw new ValidationException(collectMetrics(exceptions).toString()); } @@ -2271,7 +2268,7 @@ private RollbackResult deleteAspectWithoutMCL( final EntityAspect.EntitySystemAspect latest = (EntityAspect.EntitySystemAspect) EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getLatestAspect(urn, aspectName, false)) .orElse(null); @@ -2299,7 +2296,7 @@ private RollbackResult deleteAspectWithoutMCL( EntityAspect.EntitySystemAspect candidateAspect = (EntityAspect.EntitySystemAspect) EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getAspect(urn, aspectName, maxVersion)) .orElse(null); SystemMetadata previousSysMetadata = @@ -2325,13 +2322,9 @@ private RollbackResult deleteAspectWithoutMCL( .urn(UrnUtils.getUrn(toDelete.getUrn())) .aspectName(toDelete.getAspect()) .auditStamp(auditStamp) - .build( - opContext - .getRetrieverContext() - .get() - .getAspectRetriever())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()), - opContext.getRetrieverContext().get()); + opContext.getRetrieverContext()); if (!preCommitExceptions.isEmpty()) { throw new ValidationException(collectMetrics(preCommitExceptions).toString()); } @@ -2509,7 +2502,7 @@ private Map getEnvelopedAspects( final Map dbEntries = aspectDao.batchGet(dbKeys, false); List envelopedAspects = - EntityUtils.toSystemAspects(opContext.getRetrieverContext().get(), dbEntries.values()); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), dbEntries.values()); return envelopedAspects.stream() .collect( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java index 3c4109970e9d0..da48a2b76d6d5 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java @@ -72,7 
+72,7 @@ public static void ingestChangeProposals( entityService.ingestProposal( opContext, AspectsBatchImpl.builder() - .mcps(changes, getAuditStamp(actor), opContext.getRetrieverContext().get()) + .mcps(changes, getAuditStamp(actor), opContext.getRetrieverContext()) .build(), async); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java index ccc1910ba5cdb..c595e3e07b834 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java @@ -64,7 +64,7 @@ protected AspectsBatch buildAspectsBatch( List mcps, @Nonnull AuditStamp auditStamp) { return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java index 49fa555e006f6..74d0d8b0964de 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java @@ -59,7 +59,7 @@ protected AspectsBatch buildAspectsBatch( List mcps, @Nonnull AuditStamp auditStamp) { return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java index 367705d369c7c..6c5c6243d3362 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java @@ -143,7 +143,7 @@ private static QueryBuilder expandTerms( if (!queryUrns.isEmpty()) { scrollGraph( - opContext.getRetrieverContext().get().getGraphRetriever(), + opContext.getRetrieverContext().getGraphRetriever(), queryUrns, relationshipTypes, relationshipDirection, diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java index 4bb8e0630de48..b4ad847cb7afc 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java @@ -437,8 +437,6 @@ private void setStructuredPropertiesSearchValue( Map> definitions = opContext - .getRetrieverContext() - .get() .getAspectRetriever() .getLatestAspectObjects( propertyMap.keySet(), Set.of(STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME)); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java index ad2825ead3d0d..4a692e9534622 100644 --- 
a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java @@ -112,7 +112,7 @@ private void fetchRelatedEntities( @Nullable String scrollId, int consumedEntityCount, int batchNumber) { - GraphRetriever graph = opContext.getRetrieverContext().get().getGraphRetriever(); + GraphRetriever graph = opContext.getRetrieverContext().getGraphRetriever(); final ArrayList> futureList = new ArrayList<>(); RelatedEntitiesScrollResult result = graph.scrollRelatedEntities( @@ -165,7 +165,7 @@ private Callable processBatch( return () -> { StopWatch stopWatch = new StopWatch(); stopWatch.start(); - AspectRetriever aspectRetriever = opContext.getAspectRetrieverOpt().get(); + AspectRetriever aspectRetriever = opContext.getAspectRetriever(); log.info("Batch {} for BA:{} started", batchNumber, entityKey); ExecutionResult executionResult = new ExecutionResult(); executionResult.setBatchNumber(batchNumber); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java index efe073fc00dfd..4b09bc00efb61 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java @@ -94,8 +94,7 @@ public UpdateGraphIndicesService( public void handleChangeEvent( @Nonnull OperationContext opContext, @Nonnull final MetadataChangeLog event) { try { - MCLItemImpl mclItem = - MCLItemImpl.builder().build(event, opContext.getAspectRetrieverOpt().get()); + MCLItemImpl mclItem = MCLItemImpl.builder().build(event, opContext.getAspectRetriever()); if (UPDATE_CHANGE_TYPES.contains(event.getChangeType())) { handleUpdateChangeEvent(opContext, mclItem); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java index 187ef3e8c6229..c5fc9ebdac9fa 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java @@ -121,11 +121,10 @@ public UpdateIndicesService( public void handleChangeEvent( @Nonnull OperationContext opContext, @Nonnull final MetadataChangeLog event) { try { - MCLItemImpl batch = - MCLItemImpl.builder().build(event, opContext.getAspectRetrieverOpt().get()); + MCLItemImpl batch = MCLItemImpl.builder().build(event, opContext.getAspectRetriever()); Stream sideEffects = - AspectsBatch.applyMCLSideEffects(List.of(batch), opContext.getRetrieverContext().get()); + AspectsBatch.applyMCLSideEffects(List.of(batch), opContext.getRetrieverContext()); for (MCLItem mclItem : Stream.concat(Stream.of(batch), sideEffects).collect(Collectors.toList())) { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java index 12b12cf105196..fa6ab7932001b 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java @@ -46,12 +46,12 @@ public static Map ingestCorpUserKeyAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - 
.build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -83,12 +83,12 @@ public static Map ingestCorpUserInfoAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -121,12 +121,12 @@ public static Map ingestChartInfoAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java index 11a3153abcaee..19be1eb14667d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java @@ -16,7 +16,8 @@ import com.linkedin.data.template.StringMap; import com.linkedin.dataset.DatasetProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; import com.linkedin.metadata.entity.SearchRetriever; @@ -28,7 +29,6 @@ import com.linkedin.mxe.SystemMetadata; import com.linkedin.test.metadata.aspect.TestEntityRegistry; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; import java.util.List; @@ -53,17 +53,17 @@ public class IgnoreUnknownMutatorTest { private static final Urn TEST_DATASET_URN = UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)"); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java 
b/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java index 04aff4edf456d..e7ed267113159 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java @@ -56,8 +56,7 @@ public void testAdditionalChanges() { DefaultAspectsUtil.getAdditionalChanges( opContext, AspectsBatchImpl.builder() - .mcps( - List.of(proposal1), new AuditStamp(), opContext.getRetrieverContext().get()) + .mcps(List.of(proposal1), new AuditStamp(), opContext.getRetrieverContext()) .build() .getMCPItems(), entityServiceImpl, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java index 976b165fea53d..215e1e2431efa 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java @@ -15,7 +15,7 @@ import com.linkedin.dataproduct.DataProductAssociationArray; import com.linkedin.dataproduct.DataProductProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.aspect.batch.MCPItem; @@ -75,12 +75,12 @@ public class DataProductUnsetSideEffectTest { .build())) .build(); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); GraphRetriever graphRetriever = mock(GraphRetriever.class); RelatedEntities relatedEntities = @@ -139,7 +139,7 @@ public void setup() { retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .graphRetriever(graphRetriever) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java index 0386031cbcad8..88f84ee94c8ee 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java @@ -19,6 +19,7 @@ import com.linkedin.metadata.AspectGenerationUtils; import com.linkedin.metadata.Constants; import com.linkedin.metadata.EbeanTestUtils; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.config.EbeanConfiguration; import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.entity.ebean.EbeanAspectDao; @@ -98,12 +99,15 @@ public void setupTest() { .entityService(_entityServiceImpl) .entityRegistry(_testEntityRegistry) .build()) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () 
-> _testEntityRegistry)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, opContext -> - ((EntityServiceAspectRetriever) opContext.getAspectRetrieverOpt().get()) + ((EntityServiceAspectRetriever) opContext.getAspectRetriever()) .setSystemOperationContext(opContext), null); } @@ -152,25 +156,25 @@ public void testIngestListLatestAspects() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null))); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null))); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -230,25 +234,25 @@ public void testIngestListUrns() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null))); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null))); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -310,11 +314,11 @@ public void testSystemMetadataDuplicateKey() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(item)) .build(), false, @@ -356,7 +360,7 @@ public void testSystemMetadataDuplicateKey() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( List.of( ChangeItemImpl.builder() @@ -365,7 +369,7 @@ public void testSystemMetadataDuplicateKey() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata) .auditStamp(TEST_AUDIT_STAMP) - 
.build(TestOperationContexts.emptyAspectRetriever(null)))) + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)))) .build(), false, true); @@ -600,7 +604,7 @@ public void run() { auditStamp.setTime(System.currentTimeMillis()); AspectsBatchImpl batch = AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, operationContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, operationContext.getRetrieverContext()) .build(); entityService.ingestProposal(operationContext, batch, false); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java index 2d59632e6f3c6..c00632e5cf542 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java @@ -945,32 +945,32 @@ public void testRollbackAspect() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) .recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1037,25 +1037,25 @@ public void testRollbackKey() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(keyAspectName) .recordTemplate(writeKey1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) .recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1130,39 +1130,39 @@ public void testRollbackUrn() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(keyAspectName) .recordTemplate(writeKey1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + 
.build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) .recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1208,11 +1208,11 @@ public void testIngestGetLatestAspect() throws AssertionError { .recordTemplate(writeAspect1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1264,11 +1264,11 @@ public void testIngestGetLatestAspect() throws AssertionError { .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata2) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1320,11 +1320,11 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception { .recordTemplate(writeAspect1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1347,11 +1347,11 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception { .recordTemplate(writeAspect2) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1416,11 +1416,11 @@ public void testIngestSameAspect() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1472,11 +1472,11 @@ public void testIngestSameAspect() throws AssertionError { .recordTemplate(writeAspect2) 
.systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1534,46 +1534,46 @@ public void testRetention() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName) .recordTemplate(writeAspect1a) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName) .recordTemplate(writeAspect1b) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2a) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2b) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1610,18 +1610,18 @@ public void testRetention() throws AssertionError { .recordTemplate(writeAspect1c) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2c) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1982,8 +1982,7 @@ public void testStructuredPropertyIngestProposal() throws Exception { stream .map( entityAspect -> - EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), entityAspect) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), entityAspect) .get() .getAspect(StructuredPropertyDefinition.class)) .collect(Collectors.toSet()); @@ -1995,7 +1994,10 @@ public void testStructuredPropertyIngestProposal() throws Exception { SystemEntityClient 
mockSystemEntityClient = Mockito.mock(SystemEntityClient.class); Mockito.when( mockSystemEntityClient.getLatestAspectObject( - any(OperationContext.class), eq(firstPropertyUrn), eq("propertyDefinition"))) + any(OperationContext.class), + eq(firstPropertyUrn), + eq("propertyDefinition"), + anyBoolean())) .thenReturn(new com.linkedin.entity.Aspect(structuredPropertyDefinition.data())); // Add a value for that property @@ -2062,8 +2064,7 @@ public void testStructuredPropertyIngestProposal() throws Exception { stream .map( entityAspect -> - EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), entityAspect) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), entityAspect) .get() .getAspect(StructuredPropertyDefinition.class)) .collect(Collectors.toSet()); @@ -2074,7 +2075,10 @@ public void testStructuredPropertyIngestProposal() throws Exception { Mockito.when( mockSystemEntityClient.getLatestAspectObject( - any(OperationContext.class), eq(secondPropertyUrn), eq("propertyDefinition"))) + any(OperationContext.class), + eq(secondPropertyUrn), + eq("propertyDefinition"), + anyBoolean())) .thenReturn(new com.linkedin.entity.Aspect(secondDefinition.data())); // Get existing value for first structured property @@ -2209,7 +2213,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); ChangeItemImpl item2 = ChangeItemImpl.builder() .urn(entityUrn) @@ -2217,11 +2221,11 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(item1, item2)) .build(), false, @@ -2269,7 +2273,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { .setTags(new TagAssociationArray(new TagAssociation().setTag(tag1)))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd2 = PatchItemImpl.builder() @@ -2311,7 +2315,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2320,7 +2324,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd2, patchRemoveNonExistent)) .build(), false, @@ -2368,7 +2372,7 @@ public void testBatchPatchAdd() throws Exception { .setTags(new TagAssociationArray(new TagAssociation().setTag(tag1)))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + 
.build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd3 = PatchItemImpl.builder() @@ -2428,7 +2432,7 @@ public void testBatchPatchAdd() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2437,7 +2441,7 @@ public void testBatchPatchAdd() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd3, patchAdd2, patchAdd1)) .build(), false, @@ -2491,7 +2495,7 @@ public void testBatchPatchAddDuplicate() throws Exception { .recordTemplate(new GlobalTags().setTags(new TagAssociationArray(initialTags))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd2 = PatchItemImpl.builder() @@ -2516,7 +2520,7 @@ public void testBatchPatchAddDuplicate() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2525,7 +2529,7 @@ public void testBatchPatchAddDuplicate() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd2, patchAdd2)) // duplicate .build(), false, @@ -2581,7 +2585,7 @@ public void testPatchRemoveNonExistent() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchRemove)) .build(), false, @@ -2638,7 +2642,7 @@ public void testPatchAddNonExistent() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd)) .build(), false, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java index 550f55e6bfd0b..b4fbfecc9d60d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java @@ -10,11 +10,13 @@ import com.linkedin.metadata.AspectGenerationUtils; import com.linkedin.metadata.AspectIngestionUtils; import com.linkedin.metadata.CassandraTestUtils; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.entity.EntityServiceAspectRetriever; import com.linkedin.metadata.entity.EntityServiceImpl; import com.linkedin.metadata.entity.EntityServiceTest; import com.linkedin.metadata.entity.ListResult; +import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.key.CorpUserKey; import 
com.linkedin.metadata.models.registry.EntityRegistryException; @@ -93,12 +95,15 @@ private void configureComponents() { .entityService(_entityServiceImpl) .entityRegistry(_testEntityRegistry) .build()) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> _testEntityRegistry)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, opContext -> - ((EntityServiceAspectRetriever) opContext.getAspectRetrieverOpt().get()) + ((EntityServiceAspectRetriever) opContext.getAspectRetriever()) .setSystemOperationContext(opContext), null); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java index 3f6b301e72aa5..0a867ae3c8f2e 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java @@ -26,7 +26,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); ChangeItemImpl item2 = ChangeItemImpl.builder() .urn(entityUrn) @@ -34,7 +34,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); assertFalse(item1.isDatabaseDuplicateOf(item2)); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java index ca42f0327c86d..8f68f119cb0b7 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java @@ -11,6 +11,7 @@ import com.linkedin.metadata.recommendation.ranker.SimpleRecommendationRanker; import io.datahubproject.test.metadata.context.TestOperationContexts; import java.net.URISyntaxException; +import java.nio.file.AccessDeniedException; import java.util.List; import java.util.stream.Collectors; import org.testng.annotations.Test; @@ -74,7 +75,7 @@ private List getContentFromUrns(List urns) { } @Test - public void testService() throws URISyntaxException { + public void testService() throws URISyntaxException, AccessDeniedException { // Test non-eligible and empty RecommendationsService service = new RecommendationsService(ImmutableList.of(nonEligibleSource, emptySource), ranker); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java index 1661f5f02ee59..fa895cb454011 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java +++ 
b/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java @@ -21,7 +21,8 @@ import com.linkedin.data.ByteString; import com.linkedin.entity.Aspect; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCLItem; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; @@ -46,7 +47,6 @@ import com.linkedin.test.metadata.aspect.TestEntityRegistry; import com.linkedin.test.metadata.aspect.batch.TestMCP; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -87,18 +87,18 @@ public class SchemaFieldSideEffectTest { .build())) .build(); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java index fd768424e13c1..1825b65a18ab1 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java @@ -20,6 +20,7 @@ import com.linkedin.metadata.aspect.models.graph.RelatedEntities; import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.config.search.QueryFilterRewriterConfiguration; +import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; @@ -71,8 +72,10 @@ public void init() { () -> io.datahubproject.metadata.context.RetrieverContext.builder() .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever(() -> entityRegistry)) .graphRetriever(mockGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, null, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java index 8741e24b1bca5..de375271ed660 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java @@ -13,13 +13,14 @@ import static 
org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.RetrieverContext; import com.linkedin.metadata.aspect.models.graph.Edge; import com.linkedin.metadata.aspect.models.graph.RelatedEntities; import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.config.search.QueryFilterRewriterConfiguration; +import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; @@ -54,7 +55,7 @@ public class DomainExpansionRewriterTest @BeforeMethod public void init() { EntityRegistry entityRegistry = new TestEntityRegistry(); - AspectRetriever mockAspectRetriever = mock(AspectRetriever.class); + CachingAspectRetriever mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(entityRegistry); mockGraphRetriever = spy(GraphRetriever.class); @@ -71,8 +72,10 @@ public void init() { () -> io.datahubproject.metadata.context.RetrieverContext.builder() .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever(() -> entityRegistry)) .graphRetriever(mockGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, null, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java index c68997e25bcff..d6f5f9c3eedbe 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java @@ -18,6 +18,7 @@ import com.linkedin.data.template.StringArray; import com.linkedin.entity.Aspect; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.models.annotation.SearchableAnnotation; @@ -49,8 +50,8 @@ public class AggregationQueryBuilderTest { - private static AspectRetriever aspectRetriever; - private static AspectRetriever aspectRetrieverV1; + private static CachingAspectRetriever aspectRetriever; + private static CachingAspectRetriever aspectRetrieverV1; private static String DEFAULT_FILTER = "_index"; @BeforeClass @@ -61,7 +62,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { Urn.createFromString("urn:li:structuredProperty:under.scores.and.dots_make_a_mess"); // legacy - aspectRetriever = mock(AspectRetriever.class); + aspectRetriever = mock(CachingAspectRetriever.class); when(aspectRetriever.getEntityRegistry()) .thenReturn(TestOperationContexts.defaultEntityRegistry()); @@ -106,7 +107,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { new Aspect(structPropUnderscoresAndDotsDefinition.data())))); // V1 - aspectRetrieverV1 = mock(AspectRetriever.class); + aspectRetrieverV1 = mock(CachingAspectRetriever.class); when(aspectRetrieverV1.getEntityRegistry()) 
.thenReturn(TestOperationContexts.defaultEntityRegistry()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java index 393ca3ca5d4a6..e51511699e345 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java @@ -662,6 +662,7 @@ public void testInvalidStructuredProperty() { TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever(TestOperationContexts.emptyActiveUsersAspectRetriever(null)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java index 2c5bcd1294fa1..65b73b7425b74 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java @@ -247,6 +247,9 @@ public void testSetSearchableRefValue() throws URISyntaxException, RemoteInvocat TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -301,6 +304,9 @@ public void testSetSearchableRefValue_RuntimeException() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -337,6 +343,9 @@ public void testSetSearchableRefValue_RuntimeException_URNExist() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -369,6 +378,9 @@ void testSetSearchableRefValue_WithInvalidURN() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java index b1b716c560481..9a0a82c7f9f49 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java +++ 
b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java @@ -18,7 +18,8 @@ import com.linkedin.common.urn.UrnUtils; import com.linkedin.entity.Aspect; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.batch.PatchMCP; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; @@ -36,7 +37,6 @@ import com.linkedin.test.metadata.aspect.TestEntityRegistry; import com.linkedin.test.metadata.aspect.batch.TestMCL; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import jakarta.json.Json; import jakarta.json.JsonPatch; import java.util.List; @@ -76,13 +76,13 @@ public class PropertyDefinitionDeleteSideEffectTest { private static final Urn TEST_DATASET_URN = UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)"); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private SearchRetriever mockSearchRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); when(mockAspectRetriever.getLatestAspectObject( eq(TEST_PROPERTY_URN), eq(STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME))) @@ -101,8 +101,8 @@ public void setup() { retrieverContext = RetrieverContext.builder() .searchRetriever(mockSearchRetriever) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java index 2503faa00f6e7..6e8886f495c95 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java @@ -58,7 +58,7 @@ public void setup() { mockGraphRetriever = Mockito.mock(GraphRetriever.class); retrieverContext = io.datahubproject.metadata.context.RetrieverContext.builder() - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .searchRetriever(mockSearchRetriever) .graphRetriever(mockGraphRetriever) .build(); diff --git a/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java index 3acd2bf341357..02cd28eb202e9 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java +++ b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java @@ -171,10 +171,7 @@ public Stream> generateMCPs( DefaultAspectsUtil.getAdditionalChanges( opContext, AspectsBatchImpl.builder() - .mcps( - List.of(mcp), - auditStamp, - opContext.getRetrieverContext().get()) + .mcps(List.of(mcp), auditStamp, opContext.getRetrieverContext()) .build() 
.getMCPItems(), entityService, diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java index cf9d73dfa729b..f16c9dbd82e74 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java @@ -20,7 +20,6 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.OperationContextConfig; -import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.ServicesRegistryContext; import io.datahubproject.metadata.context.ValidationContext; import io.datahubproject.test.metadata.context.TestOperationContexts; @@ -95,7 +94,7 @@ public OperationContext operationContext( entityRegistry, mock(ServicesRegistryContext.class), indexConvention, - mock(RetrieverContext.class), + TestOperationContexts.emptyActiveUsersRetrieverContext(() -> entityRegistry), mock(ValidationContext.class)); } diff --git a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java index 47740b02d6166..65ee6b8591f48 100644 --- a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java +++ b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java @@ -93,8 +93,6 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { new RelatedEntity(BUSINESS_ATTRIBUTE_OF, SCHEMA_FIELD_URN.toString()))); when(opContext - .getRetrieverContext() - .get() .getAspectRetriever() .getLatestAspectObjects( eq(Set.of(SCHEMA_FIELD_URN)), eq(Set.of(BUSINESS_ATTRIBUTE_ASPECT)))) @@ -108,7 +106,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { // verify // page 1 - Mockito.verify(opContext.getRetrieverContext().get().getGraphRetriever(), Mockito.times(1)) + Mockito.verify(opContext.getRetrieverContext().getGraphRetriever(), Mockito.times(1)) .scrollRelatedEntities( isNull(), any(Filter.class), @@ -122,7 +120,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { isNull(), isNull()); // page 2 - Mockito.verify(opContext.getRetrieverContext().get().getGraphRetriever(), Mockito.times(1)) + Mockito.verify(opContext.getRetrieverContext().getGraphRetriever(), Mockito.times(1)) .scrollRelatedEntities( isNull(), any(Filter.class), @@ -136,7 +134,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { isNull(), isNull()); - Mockito.verifyNoMoreInteractions(opContext.getRetrieverContext().get().getGraphRetriever()); + Mockito.verifyNoMoreInteractions(opContext.getRetrieverContext().getGraphRetriever()); // 2 pages = 2 ingest proposals Mockito.verify(mockUpdateIndicesService, Mockito.times(2)) @@ -152,8 +150,8 @@ private void testMCLOnInvalidCategory() throws Exception { businessAttributeServiceHook.handleChangeEvent(opContext, platformEvent); // verify - Mockito.verifyNoInteractions(opContext.getRetrieverContext().get().getGraphRetriever()); - Mockito.verifyNoInteractions(opContext.getAspectRetrieverOpt().get()); + 
Mockito.verifyNoInteractions(opContext.getRetrieverContext().getGraphRetriever()); + Mockito.verifyNoInteractions(opContext.getAspectRetriever()); Mockito.verifyNoInteractions(mockUpdateIndicesService); } @@ -226,13 +224,15 @@ private OperationContext mockOperationContextWithGraph(List graph RetrieverContext mockRetrieverContext = mock(RetrieverContext.class); when(mockRetrieverContext.getAspectRetriever()).thenReturn(mock(AspectRetriever.class)); + when(mockRetrieverContext.getCachingAspectRetriever()) + .thenReturn(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); when(mockRetrieverContext.getGraphRetriever()).thenReturn(graphRetriever); OperationContext opContext = TestOperationContexts.systemContextNoSearchAuthorization(mockRetrieverContext); // reset mock for test - reset(opContext.getAspectRetrieverOpt().get()); + reset(opContext.getAspectRetriever()); if (!graphEdges.isEmpty()) { diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java index e65bf22991736..c08b7fad4dee3 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java @@ -1,12 +1,23 @@ package io.datahubproject.metadata.context; +import static com.linkedin.metadata.Constants.CORP_USER_KEY_ASPECT_NAME; +import static com.linkedin.metadata.Constants.CORP_USER_STATUS_ASPECT_NAME; +import static com.linkedin.metadata.Constants.CORP_USER_STATUS_SUSPENDED; +import static com.linkedin.metadata.Constants.STATUS_ASPECT_NAME; +import static com.linkedin.metadata.Constants.SYSTEM_ACTOR; + import com.datahub.authentication.Authentication; +import com.linkedin.common.Status; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; +import com.linkedin.entity.Aspect; +import com.linkedin.identity.CorpUserStatus; +import com.linkedin.metadata.aspect.AspectRetriever; import com.linkedin.metadata.authorization.PoliciesConfig; import com.linkedin.policy.DataHubPolicyInfo; import java.util.Collection; import java.util.Collections; +import java.util.Map; import java.util.Optional; import java.util.Set; import lombok.Builder; @@ -48,6 +59,43 @@ public Urn getActorUrn() { return UrnUtils.getUrn(authentication.getActor().toUrnStr()); } + /** + * Actor is considered active if the user is not hard-deleted, soft-deleted, and is not suspended + * + * @param aspectRetriever aspect retriever - ideally the SystemEntityClient backed one for caching + * @return active status + */ + public boolean isActive(AspectRetriever aspectRetriever) { + // system cannot be disabled + if (SYSTEM_ACTOR.equals(authentication.getActor().toUrnStr())) { + return true; + } + + Urn selfUrn = UrnUtils.getUrn(authentication.getActor().toUrnStr()); + Map> urnAspectMap = + aspectRetriever.getLatestAspectObjects( + Set.of(selfUrn), + Set.of(STATUS_ASPECT_NAME, CORP_USER_STATUS_ASPECT_NAME, CORP_USER_KEY_ASPECT_NAME)); + + Map aspectMap = urnAspectMap.getOrDefault(selfUrn, Map.of()); + + if (!aspectMap.containsKey(CORP_USER_KEY_ASPECT_NAME)) { + // user is hard deleted + return false; + } + + Status status = + Optional.ofNullable(aspectMap.get(STATUS_ASPECT_NAME)) + .map(a -> new Status(a.data())) + .orElse(new Status().setRemoved(false)); + CorpUserStatus corpUserStatus = + Optional.ofNullable(aspectMap.get(CORP_USER_STATUS_ASPECT_NAME)) + .map(a -> new 
CorpUserStatus(a.data())) + .orElse(new CorpUserStatus().setStatus("")); + + return !status.isRemoved() && !CORP_USER_STATUS_SUSPENDED.equals(corpUserStatus.getStatus()); + } + /** * The current implementation creates a cache entry unique for the set of policies. * diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java index 9a058c526647c..9158129235b39 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java @@ -16,6 +16,8 @@ import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.utils.AuditStampUtils; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; +import io.datahubproject.metadata.exception.ActorAccessException; +import io.datahubproject.metadata.exception.OperationContextException; import java.util.Collection; import java.util.Objects; import java.util.Optional; @@ -63,6 +65,24 @@ public static OperationContext asSession( @Nonnull Authorizer authorizer, @Nonnull Authentication sessionAuthentication, boolean allowSystemAuthentication) { + return OperationContext.asSession( + systemOperationContext, + requestContext, + authorizer, + sessionAuthentication, + allowSystemAuthentication, + false); + } + + @Nonnull + public static OperationContext asSession( + OperationContext systemOperationContext, + @Nonnull RequestContext requestContext, + @Nonnull Authorizer authorizer, + @Nonnull Authentication sessionAuthentication, + boolean allowSystemAuthentication, + boolean skipCache) + throws ActorAccessException { return systemOperationContext.toBuilder() .operationContextConfig( // update allowed system authentication @@ -72,7 +92,7 @@ public static OperationContext asSession( .authorizationContext(AuthorizationContext.builder().authorizer(authorizer).build()) .requestContext(requestContext) .validationContext(systemOperationContext.getValidationContext()) - .build(sessionAuthentication); + .build(sessionAuthentication, skipCache); } /** @@ -85,10 +105,14 @@ public static OperationContext asSession( public static OperationContext withSearchFlags( OperationContext opContext, Function flagDefaults) { - return opContext.toBuilder() - // update search flags for the request's session - .searchContext(opContext.getSearchContext().withFlagDefaults(flagDefaults)) - .build(opContext.getSessionActorContext()); + try { + return opContext.toBuilder() + // update search flags for the request's session + .searchContext(opContext.getSearchContext().withFlagDefaults(flagDefaults)) + .build(opContext.getSessionActorContext(), false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } /** @@ -101,10 +125,14 @@ public static OperationContext withSearchFlags( public static OperationContext withLineageFlags( OperationContext opContext, Function flagDefaults) { - return opContext.toBuilder() - // update lineage flags for the request's session - .searchContext(opContext.getSearchContext().withLineageFlagDefaults(flagDefaults)) - .build(opContext.getSessionActorContext()); + try { + return opContext.toBuilder() + // update lineage flags for the request's session + .searchContext(opContext.getSearchContext().withLineageFlagDefaults(flagDefaults)) + .build(opContext.getSessionActorContext(), false); + } catch (OperationContextException e) { + 
throw new RuntimeException(e); + } } /** @@ -155,18 +183,22 @@ public static OperationContext asSystem( ? SearchContext.EMPTY : SearchContext.builder().indexConvention(indexConvention).build(); - return OperationContext.builder() - .operationContextConfig(systemConfig) - .systemActorContext(systemActorContext) - .searchContext(systemSearchContext) - .entityRegistryContext(EntityRegistryContext.builder().build(entityRegistry)) - .servicesRegistryContext(servicesRegistryContext) - // Authorizer.EMPTY doesn't actually apply to system auth - .authorizationContext(AuthorizationContext.builder().authorizer(Authorizer.EMPTY).build()) - .retrieverContext(retrieverContext) - .objectMapperContext(objectMapperContext) - .validationContext(validationContext) - .build(systemAuthentication); + try { + return OperationContext.builder() + .operationContextConfig(systemConfig) + .systemActorContext(systemActorContext) + .searchContext(systemSearchContext) + .entityRegistryContext(EntityRegistryContext.builder().build(entityRegistry)) + .servicesRegistryContext(servicesRegistryContext) + // Authorizer.EMPTY doesn't actually apply to system auth + .authorizationContext(AuthorizationContext.builder().authorizer(Authorizer.EMPTY).build()) + .retrieverContext(retrieverContext) + .objectMapperContext(objectMapperContext) + .validationContext(validationContext) + .build(systemAuthentication, false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } @Nonnull private final OperationContextConfig operationContextConfig; @@ -177,7 +209,7 @@ public static OperationContext asSystem( @Nonnull private final EntityRegistryContext entityRegistryContext; @Nullable private final ServicesRegistryContext servicesRegistryContext; @Nullable private final RequestContext requestContext; - @Nullable private final RetrieverContext retrieverContext; + @Nonnull private final RetrieverContext retrieverContext; @Nonnull private final ObjectMapperContext objectMapperContext; @Nonnull private final ValidationContext validationContext; @@ -194,13 +226,15 @@ public OperationContext withLineageFlags( public OperationContext asSession( @Nonnull RequestContext requestContext, @Nonnull Authorizer authorizer, - @Nonnull Authentication sessionAuthentication) { + @Nonnull Authentication sessionAuthentication) + throws ActorAccessException { return OperationContext.asSession( this, requestContext, authorizer, sessionAuthentication, - getOperationContextConfig().isAllowSystemAuthentication()); + getOperationContextConfig().isAllowSystemAuthentication(), + false); } @Nonnull @@ -284,17 +318,9 @@ public AuditStamp getAuditStamp() { return getAuditStamp(null); } - public Optional getRetrieverContext() { - return Optional.ofNullable(retrieverContext); - } - - @Nullable + @Nonnull public AspectRetriever getAspectRetriever() { - return getAspectRetrieverOpt().orElse(null); - } - - public Optional getAspectRetrieverOpt() { - return getRetrieverContext().map(RetrieverContext::getAspectRetriever); + return retrieverContext.getAspectRetriever(); } /** @@ -336,10 +362,7 @@ public String getGlobalContextId() { ? EmptyContext.EMPTY : getServicesRegistryContext()) .add(getRequestContext() == null ? EmptyContext.EMPTY : getRequestContext()) - .add( - getRetrieverContext().isPresent() - ? getRetrieverContext().get() - : EmptyContext.EMPTY) + .add(getRetrieverContext()) .add(getObjectMapperContext()) .build() .stream() @@ -364,10 +387,7 @@ public String getSearchContextId() { getServicesRegistryContext() == null ? 
EmptyContext.EMPTY : getServicesRegistryContext()) - .add( - getRetrieverContext().isPresent() - ? getRetrieverContext().get() - : EmptyContext.EMPTY) + .add(getRetrieverContext()) .build() .stream() .map(ContextInterface::getCacheKeyComponent) @@ -438,6 +458,12 @@ public static class OperationContextBuilder { @Nonnull public OperationContext build(@Nonnull Authentication sessionAuthentication) { + return build(sessionAuthentication, false); + } + + @Nonnull + public OperationContext build( + @Nonnull Authentication sessionAuthentication, boolean skipCache) { final Urn actorUrn = UrnUtils.getUrn(sessionAuthentication.getActor().toUrnStr()); final ActorContext sessionActor = ActorContext.builder() @@ -451,11 +477,20 @@ public OperationContext build(@Nonnull Authentication sessionAuthentication) { .policyInfoSet(this.authorizationContext.getAuthorizer().getActorPolicies(actorUrn)) .groupMembership(this.authorizationContext.getAuthorizer().getActorGroups(actorUrn)) .build(); - return build(sessionActor); + return build(sessionActor, skipCache); } @Nonnull - public OperationContext build(@Nonnull ActorContext sessionActor) { + public OperationContext build(@Nonnull ActorContext sessionActor, boolean skipCache) { + AspectRetriever retriever = + skipCache + ? this.retrieverContext.getAspectRetriever() + : this.retrieverContext.getCachingAspectRetriever(); + + if (!sessionActor.isActive(retriever)) { + throw new ActorAccessException("Actor is not active"); + } + return new OperationContext( this.operationContextConfig, sessionActor, diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java index 9337fbfe3bb00..9afc4138810bb 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java @@ -1,8 +1,10 @@ package io.datahubproject.metadata.context; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.entity.SearchRetriever; +import java.util.Objects; import java.util.Optional; import javax.annotation.Nonnull; import lombok.Builder; @@ -15,10 +17,37 @@ public class RetrieverContext @Nonnull private final GraphRetriever graphRetriever; @Nonnull private final AspectRetriever aspectRetriever; + @Nonnull private final CachingAspectRetriever cachingAspectRetriever; @Nonnull private final SearchRetriever searchRetriever; @Override public Optional getCacheKeyComponent() { return Optional.empty(); } + + public static class RetrieverContextBuilder { + public RetrieverContext build() { + if (this.aspectRetriever == null && this.cachingAspectRetriever != null) { + this.aspectRetriever = this.cachingAspectRetriever; + } + + if (this.cachingAspectRetriever == null + && this.aspectRetriever instanceof CachingAspectRetriever) { + this.cachingAspectRetriever = (CachingAspectRetriever) this.aspectRetriever; + } + + return new RetrieverContext( + this.graphRetriever, + Objects.requireNonNull(this.aspectRetriever), + Objects.requireNonNull(this.cachingAspectRetriever), + this.searchRetriever); + } + } + + public static final RetrieverContext EMPTY = + RetrieverContext.builder() + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) + 
.cachingAspectRetriever(CachingAspectRetriever.EMPTY) + .build(); } diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java new file mode 100644 index 0000000000000..bca2594b96430 --- /dev/null +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java @@ -0,0 +1,7 @@ +package io.datahubproject.metadata.exception; + +public class ActorAccessException extends OperationContextException { + public ActorAccessException(String string) { + super(string); + } +} diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java new file mode 100644 index 0000000000000..1aac8dc3e60ec --- /dev/null +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java @@ -0,0 +1,9 @@ +package io.datahubproject.metadata.exception; + +public class OperationContextException extends RuntimeException { + public OperationContextException(String message) { + super(message); + } + + public OperationContextException() {} +} diff --git a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java index 42de6b7398c61..4abfbb196f067 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java @@ -8,21 +8,17 @@ import com.linkedin.common.urn.Urn; import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.entity.Aspect; +import com.linkedin.identity.CorpUserInfo; +import com.linkedin.metadata.Constants; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; -import com.linkedin.metadata.aspect.SystemAspect; -import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistryException; import com.linkedin.metadata.models.registry.MergedEntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; -import com.linkedin.metadata.query.filter.Filter; -import com.linkedin.metadata.query.filter.RelationshipFilter; -import com.linkedin.metadata.query.filter.SortCriterion; -import com.linkedin.metadata.search.ScrollResult; -import com.linkedin.metadata.search.SearchEntityArray; import com.linkedin.metadata.snapshot.Snapshot; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; @@ -32,15 +28,14 @@ import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.ServicesRegistryContext; import io.datahubproject.metadata.context.ValidationContext; -import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; 
import java.util.function.Consumer; import java.util.function.Supplier; +import java.util.stream.Collectors; import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.Builder; /** * Useful for testing. If the defaults are not sufficient, try using the .toBuilder() and replacing @@ -81,26 +76,53 @@ public static EntityRegistry defaultEntityRegistry() { return defaultEntityRegistryInstance; } - public static AspectRetriever emptyAspectRetriever( + public static RetrieverContext emptyActiveUsersRetrieverContext( @Nullable Supplier entityRegistrySupplier) { - return new EmptyAspectRetriever( - () -> - Optional.ofNullable(entityRegistrySupplier) - .map(Supplier::get) - .orElse(defaultEntityRegistry())); - } - public static GraphRetriever emptyGraphRetriever = new EmptyGraphRetriever(); - public static SearchRetriever emptySearchRetriever = new EmptySearchRetriever(); + return RetrieverContext.builder() + .cachingAspectRetriever(emptyActiveUsersAspectRetriever(entityRegistrySupplier)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) + .build(); + } - public static RetrieverContext emptyRetrieverContext( + public static CachingAspectRetriever emptyActiveUsersAspectRetriever( @Nullable Supplier entityRegistrySupplier) { - return RetrieverContext.builder() - .aspectRetriever(emptyAspectRetriever(entityRegistrySupplier)) - .graphRetriever(emptyGraphRetriever) - .searchRetriever(emptySearchRetriever) - .build(); + return new CachingAspectRetriever.EmptyAspectRetriever() { + + @Nonnull + @Override + public Map> getLatestAspectObjects( + Set urns, Set aspectNames) { + if (urns.stream().allMatch(urn -> urn.toString().startsWith("urn:li:corpuser:")) + && aspectNames.contains(Constants.CORP_USER_KEY_ASPECT_NAME)) { + return urns.stream() + .map( + urn -> + Map.entry( + urn, + Map.of( + Constants.CORP_USER_KEY_ASPECT_NAME, + new Aspect( + new CorpUserInfo() + .setActive(true) + .setEmail(urn.getId()) + .setDisplayName(urn.getId()) + .data())))) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + return super.getLatestAspectObjects(urns, aspectNames); + } + + @Nonnull + @Override + public EntityRegistry getEntityRegistry() { + return Optional.ofNullable(entityRegistrySupplier) + .map(Supplier::get) + .orElse(defaultEntityRegistry()); + } + }; } public static OperationContext systemContextNoSearchAuthorization( @@ -140,8 +162,10 @@ public static OperationContext systemContextNoSearchAuthorization( RetrieverContext retrieverContext = RetrieverContext.builder() .aspectRetriever(aspectRetriever) - .graphRetriever(emptyGraphRetriever) - .searchRetriever(emptySearchRetriever) + .cachingAspectRetriever( + emptyActiveUsersAspectRetriever(() -> aspectRetriever.getEntityRegistry())) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(); return systemContextNoSearchAuthorization( () -> retrieverContext.getAspectRetriever().getEntityRegistry(), @@ -208,7 +232,7 @@ public static OperationContext systemContext( RetrieverContext retrieverContext = Optional.ofNullable(retrieverContextSupplier) .map(Supplier::get) - .orElse(emptyRetrieverContext(entityRegistrySupplier)); + .orElse(emptyActiveUsersRetrieverContext(entityRegistrySupplier)); EntityRegistry entityRegistry = Optional.ofNullable(entityRegistrySupplier) @@ -298,66 +322,5 @@ public static OperationContext userContextNoSearchAuthorization( .asSession(requestContext, Authorizer.EMPTY, TEST_USER_AUTH); } - @Builder - public static class 
EmptyAspectRetriever implements AspectRetriever { - private final Supplier entityRegistrySupplier; - - @Nonnull - @Override - public Map> getLatestAspectObjects( - Set urns, Set aspectNames) { - return Map.of(); - } - - @Nonnull - @Override - public Map> getLatestSystemAspects( - Map> urnAspectNames) { - return Map.of(); - } - - @Nonnull - @Override - public EntityRegistry getEntityRegistry() { - return entityRegistrySupplier.get(); - } - } - - public static class EmptyGraphRetriever implements GraphRetriever { - - @Nonnull - @Override - public RelatedEntitiesScrollResult scrollRelatedEntities( - @Nullable List sourceTypes, - @Nonnull Filter sourceEntityFilter, - @Nullable List destinationTypes, - @Nonnull Filter destinationEntityFilter, - @Nonnull List relationshipTypes, - @Nonnull RelationshipFilter relationshipFilter, - @Nonnull List sortCriterion, - @Nullable String scrollId, - int count, - @Nullable Long startTimeMillis, - @Nullable Long endTimeMillis) { - return new RelatedEntitiesScrollResult(0, 0, null, List.of()); - } - } - - public static class EmptySearchRetriever implements SearchRetriever { - - @Override - public ScrollResult scroll( - @Nonnull List entities, - @Nullable Filter filters, - @Nullable String scrollId, - int count) { - ScrollResult empty = new ScrollResult(); - empty.setEntities(new SearchEntityArray()); - empty.setNumEntities(0); - empty.setPageSize(0); - return empty; - } - } - private TestOperationContexts() {} } diff --git a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java index 3e092e20127ee..f77b244d8f2d8 100644 --- a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java +++ b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java @@ -8,6 +8,7 @@ import com.datahub.authentication.Authentication; import com.datahub.plugins.auth.authorization.Authorizer; import com.linkedin.metadata.models.registry.EntityRegistry; +import io.datahubproject.test.metadata.context.TestOperationContexts; import org.testng.annotations.Test; public class OperationContextTest { @@ -25,7 +26,7 @@ public void testSystemPrivilegeEscalation() { mock(EntityRegistry.class), mock(ServicesRegistryContext.class), null, - mock(RetrieverContext.class), + TestOperationContexts.emptyActiveUsersRetrieverContext(null), mock(ValidationContext.class)); OperationContext opContext = diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java index 6724f35d840ad..a9871f1ed9948 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java @@ -145,7 +145,7 @@ public String generateAccessToken( _entityService.ingestProposal( systemOperationContext, AspectsBatchImpl.builder() - .mcps(List.of(proposal), auditStamp, systemOperationContext.getRetrieverContext().get()) + .mcps(List.of(proposal), auditStamp, systemOperationContext.getRetrieverContext()) .build(), false); diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 9348416606d0a..75b4c8e8b002f 100644 
--- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -522,12 +522,12 @@ cache: entityAspectTTLSeconds: # cache user aspects for 20s corpuser: - corpUserKey: 20 + corpUserKey: 300 # 5 min corpUserInfo: 20 corpUserEditableInfo: 20 - corpUserStatus: 20 + corpUserStatus: 300 # 5 min globalTags: 20 - status: 20 + status: 300 # 5 min corpUserCredentials: 20 corpUserSettings: 20 roleMembership: 20 diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java index f5235dc3682fc..3e2823591e168 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java @@ -45,7 +45,8 @@ protected OperationContext javaSystemOperationContext( @Nonnull final SearchService searchService, @Qualifier("baseElasticSearchComponents") BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components, - @Nonnull final ConfigurationProvider configurationProvider) { + @Nonnull final ConfigurationProvider configurationProvider, + @Qualifier("systemEntityClient") @Nonnull final SystemEntityClient systemEntityClient) { EntityServiceAspectRetriever entityServiceAspectRetriever = EntityServiceAspectRetriever.builder() @@ -53,6 +54,9 @@ protected OperationContext javaSystemOperationContext( .entityService(entityService) .build(); + EntityClientAspectRetriever entityClientAspectRetriever = + EntityClientAspectRetriever.builder().entityClient(systemEntityClient).build(); + SystemGraphRetriever systemGraphRetriever = SystemGraphRetriever.builder().graphService(graphService).build(); @@ -68,6 +72,7 @@ protected OperationContext javaSystemOperationContext( components.getIndexConvention(), RetrieverContext.builder() .aspectRetriever(entityServiceAspectRetriever) + .cachingAspectRetriever(entityClientAspectRetriever) .graphRetriever(systemGraphRetriever) .searchRetriever(searchServiceSearchRetriever) .build(), @@ -76,6 +81,7 @@ protected OperationContext javaSystemOperationContext( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) .build()); + entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext); systemGraphRetriever.setSystemOperationContext(systemOperationContext); searchServiceSearchRetriever.setSystemOperationContext(systemOperationContext); @@ -104,7 +110,7 @@ protected OperationContext restliSystemOperationContext( BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components, @Nonnull final ConfigurationProvider configurationProvider) { - EntityClientAspectRetriever entityServiceAspectRetriever = + EntityClientAspectRetriever entityClientAspectRetriever = EntityClientAspectRetriever.builder().entityClient(systemEntityClient).build(); SystemGraphRetriever systemGraphRetriever = @@ -121,7 +127,7 @@ protected OperationContext restliSystemOperationContext( ServicesRegistryContext.builder().restrictedService(restrictedService).build(), components.getIndexConvention(), RetrieverContext.builder() - .aspectRetriever(entityServiceAspectRetriever) + .cachingAspectRetriever(entityClientAspectRetriever) .graphRetriever(systemGraphRetriever) 
.searchRetriever(searchServiceSearchRetriever) .build(), @@ -130,7 +136,7 @@ protected OperationContext restliSystemOperationContext( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) .build()); - entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext); + entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); systemGraphRetriever.setSystemOperationContext(systemOperationContext); searchServiceSearchRetriever.setSystemOperationContext(systemOperationContext); diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java index 22ce06a5984ea..c04dd25ccd4ac 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java @@ -84,14 +84,14 @@ public void execute(@Nonnull OperationContext systemOperationContext) throws Exc .aspectName(DATA_PLATFORM_INSTANCE_ASPECT_NAME) .recordTemplate(dataPlatformInstance.get()) .auditStamp(aspectAuditStamp) - .build(systemOperationContext.getAspectRetrieverOpt().get())); + .build(systemOperationContext.getAspectRetriever())); } } _entityService.ingestAspects( systemOperationContext, AspectsBatchImpl.builder() - .retrieverContext(systemOperationContext.getRetrieverContext().get()) + .retrieverContext(systemOperationContext.getRetrieverContext()) .items(items) .build(), true, diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java index eb6bfe17ac198..dac2879487469 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java @@ -225,7 +225,7 @@ private void ingestPolicy( new AuditStamp() .setActor(Urn.createFromString(Constants.SYSTEM_ACTOR)) .setTime(System.currentTimeMillis()), - systemOperationContext.getRetrieverContext().get()) + systemOperationContext.getRetrieverContext()) .build(), false); } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java similarity index 81% rename from metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java rename to metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java index ba0a426fa20e8..c756827cad56b 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java @@ -1,9 +1,11 @@ -package io.datahubproject.openapi; +package io.datahubproject.openapi.config; import com.linkedin.metadata.dao.throttle.APIThrottleException; +import io.datahubproject.metadata.exception.ActorAccessException; import io.datahubproject.openapi.exception.InvalidUrnException; import io.datahubproject.openapi.exception.UnauthorizedException; import 
java.util.Map; +import javax.annotation.PostConstruct; import lombok.extern.slf4j.Slf4j; import org.springframework.beans.ConversionNotSupportedException; import org.springframework.core.Ordered; @@ -19,6 +21,11 @@ @ControllerAdvice public class GlobalControllerExceptionHandler extends DefaultHandlerExceptionResolver { + @PostConstruct + public void init() { + log.info("GlobalControllerExceptionHandler initialized"); + } + public GlobalControllerExceptionHandler() { setOrder(Ordered.HIGHEST_PRECEDENCE); setWarnLogCategory(getClass().getName()); @@ -52,4 +59,9 @@ public static ResponseEntity> handleUnauthorizedException( UnauthorizedException e) { return new ResponseEntity<>(Map.of("error", e.getMessage()), HttpStatus.FORBIDDEN); } + + @ExceptionHandler(ActorAccessException.class) + public static ResponseEntity> actorAccessException(ActorAccessException e) { + return new ResponseEntity<>(Map.of("error", e.getMessage()), HttpStatus.FORBIDDEN); + } } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java index 579a62c084999..592d7bba4211f 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java @@ -637,7 +637,7 @@ public ResponseEntity createAspect( AspectSpec aspectSpec = lookupAspectSpec(entitySpec, aspectName).get(); ChangeMCP upsert = toUpsertItem( - opContext.getRetrieverContext().get().getAspectRetriever(), + opContext.getRetrieverContext().getAspectRetriever(), urn, aspectSpec, createIfEntityNotExists, @@ -649,7 +649,7 @@ public ResponseEntity createAspect( entityService.ingestProposal( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(upsert)) .build(), async); @@ -725,7 +725,7 @@ public ResponseEntity patchAspect( .build(); ChangeMCP upsert = toUpsertItem( - opContext.getRetrieverContext().get().getAspectRetriever(), + opContext.getRetrieverContext().getAspectRetriever(), validatedUrn(entityUrn), aspectSpec, currentValue, @@ -736,7 +736,7 @@ public ResponseEntity patchAspect( entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(upsert)) .build(), true, diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java new file mode 100644 index 0000000000000..99d3879ab9a32 --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java @@ -0,0 +1,54 @@ +package io.datahubproject.openapi.operations.test; + +import com.datahub.authentication.Authentication; +import com.datahub.authentication.AuthenticationContext; +import com.datahub.authorization.AuthorizerChain; +import io.datahubproject.metadata.context.OperationContext; +import io.datahubproject.metadata.context.RequestContext; +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.tags.Tag; +import jakarta.servlet.http.HttpServletRequest; +import 
java.util.List; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; + +@RestController +@RequestMapping("/operations/identity") +@Slf4j +@Tag(name = "Identity", description = "An API for checking identity") +public class IdController { + private final AuthorizerChain authorizerChain; + private final OperationContext systemOperationContext; + + public IdController(OperationContext systemOperationContext, AuthorizerChain authorizerChain) { + this.systemOperationContext = systemOperationContext; + this.authorizerChain = authorizerChain; + } + + @Tag(name = "User") + @GetMapping(path = "/user/urn", produces = MediaType.APPLICATION_JSON_VALUE) + @Operation(summary = "User id") + public ResponseEntity> getUserId( + HttpServletRequest request, + @RequestParam(value = "skipCache", required = false, defaultValue = "false") + Boolean skipCache) { + Authentication authentication = AuthenticationContext.getAuthentication(); + String actorUrnStr = authentication.getActor().toUrnStr(); + + OperationContext.asSession( + systemOperationContext, + RequestContext.builder().buildOpenapi(actorUrnStr, request, "getUserIdentity", List.of()), + authorizerChain, + authentication, + true, + skipCache); + + return ResponseEntity.ok(Map.of("urn", actorUrnStr)); + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java index c38f2db0eefbb..ca425810c87a0 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java @@ -491,7 +491,7 @@ public static List> ingestBatchProposal( try { AspectsBatch batch = AspectsBatchImpl.builder() - .mcps(serviceProposals, auditStamp, opContext.getRetrieverContext().get()) + .mcps(serviceProposals, auditStamp, opContext.getRetrieverContext()) .build(); Map> resultMap = diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java index 56a7955b9fe87..b1c5709ef0147 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java @@ -203,7 +203,7 @@ protected AspectsBatch toMCPBatch( objectMapper.writeValueAsString(aspect.getValue().get("systemMetadata")))); } - items.add(builder.build(opContext.getAspectRetrieverOpt().get())); + items.add(builder.build(opContext.getAspectRetriever())); } } } @@ -211,7 +211,7 @@ protected AspectsBatch toMCPBatch( return AspectsBatchImpl.builder() .items(items) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java 
b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java index ce7fd73f99b9e..af13cd3aab051 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java @@ -554,14 +554,14 @@ protected AspectsBatch toMCPBatch( GenericRecordUtils.JSON, aspectSpec)); - items.add(builder.build(opContext.getRetrieverContext().get().getAspectRetriever())); + items.add(builder.build(opContext.getRetrieverContext().getAspectRetriever())); } } } } return AspectsBatchImpl.builder() .items(items) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json index 33cfba0f27802..27731af9ffaa7 100644 --- a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json +++ b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json @@ -19,6 +19,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] }, { "method" : "batch_get", @@ -27,6 +31,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] } ], "entity" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json index 9bf7f97b34be1..9c5f41281fcfb 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json @@ -182,6 +182,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] }, { "method" : "batch_get", @@ -190,6 +194,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] } ], "entity" : { diff --git a/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java index cf6e571cb8cbe..b85f22e781d0b 100644 --- a/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java +++ b/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java @@ -45,12 +45,34 @@ // Consider renaming this to datahub client. 
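The restspec and snapshot additions above expose alwaysIncludeKeyAspect as an optional query parameter on the entitiesV2 resource, so the new behavior is also reachable over plain HTTP. A hedged sketch of such a call; the base URL, bearer token, and the Rest.li 2.0 List(...) array encoding are assumptions rather than part of this change:

from urllib.parse import quote

import requests

GMS = "http://localhost:8080"  # assumed local GMS address
TOKEN = "<personal-access-token>"  # assumed; any valid bearer token

urn = "urn:li:corpuser:sessionUser"
response = requests.get(
    f"{GMS}/entitiesV2/{quote(urn, safe='')}",
    headers={
        "Authorization": f"Bearer {TOKEN}",
        "X-RestLi-Protocol-Version": "2.0.0",
    },
    params={
        "aspects": "List(corpUserInfo,corpUserStatus)",
        # New flag: when false, key aspects are only returned if they are
        # actually stored, instead of being synthesized for every urn.
        "alwaysIncludeKeyAspect": "false",
    },
)
response.raise_for_status()
print(response.json())
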
public interface EntityClient { + /** + * This version follows the legacy behavior of returning key aspects regardless of whether they + * exist + * + * @param opContext operation context + * @param entityName entity type + * @param urn urn id for the entity + * @param aspectNames set of aspects + * @return requested entity/aspects + */ + @Deprecated @Nullable - EntityResponse getV2( + default EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, @Nullable final Set aspectNames) + throws RemoteInvocationException, URISyntaxException { + return getV2(opContext, entityName, urn, aspectNames, true); + } + + @Nullable + EntityResponse getV2( + @Nonnull OperationContext opContext, + @Nonnull String entityName, + @Nonnull final Urn urn, + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException; @Nonnull @@ -58,12 +80,34 @@ EntityResponse getV2( Entity get(@Nonnull OperationContext opContext, @Nonnull final Urn urn) throws RemoteInvocationException; + /** + * This version follows the legacy behavior of returning key aspects regardless of whether they + * exist + * + * @param opContext operation context + * @param entityName entity type + * @param urns urn ids for the entities + * @param aspectNames set of aspects + * @return requested entity/aspects + */ + @Deprecated @Nonnull - Map batchGetV2( + default Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Set urns, @Nullable final Set aspectNames) + throws RemoteInvocationException, URISyntaxException { + return batchGetV2(opContext, entityName, urns, aspectNames, true); + } + + @Nonnull + Map batchGetV2( + @Nonnull OperationContext opContext, + @Nonnull String entityName, + @Nonnull final Set urns, + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException; @Nonnull @@ -589,27 +633,38 @@ void rollbackIngestion( @Nullable default Aspect getLatestAspectObject( - @Nonnull OperationContext opContext, @Nonnull Urn urn, @Nonnull String aspectName) + @Nonnull OperationContext opContext, + @Nonnull Urn urn, + @Nonnull String aspectName, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { - return getLatestAspects(opContext, Set.of(urn), Set.of(aspectName)) + return getLatestAspects(opContext, Set.of(urn), Set.of(aspectName), alwaysIncludeKeyAspect) .getOrDefault(urn, Map.of()) .get(aspectName); } @Nonnull default Map> getLatestAspects( - @Nonnull OperationContext opContext, @Nonnull Set urns, @Nonnull Set aspectNames) + @Nonnull OperationContext opContext, + @Nonnull Set urns, + @Nonnull Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { String entityName = urns.stream().findFirst().map(Urn::getEntityType).get(); - return entityResponseToAspectMap(batchGetV2(opContext, entityName, urns, aspectNames)); + return entityResponseToAspectMap( + batchGetV2(opContext, entityName, urns, aspectNames, alwaysIncludeKeyAspect)); } @Nonnull default Map> getLatestSystemAspect( - @Nonnull OperationContext opContext, @Nonnull Set urns, @Nonnull Set aspectNames) + @Nonnull OperationContext opContext, + @Nonnull Set urns, + @Nonnull Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { String entityName = 
urns.stream().findFirst().map(Urn::getEntityType).get(); return entityResponseToSystemAspectMap( - batchGetV2(opContext, entityName, urns, aspectNames), opContext.getEntityRegistry()); + batchGetV2(opContext, entityName, urns, aspectNames, alwaysIncludeKeyAspect), + opContext.getEntityRegistry()); } } diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java index 516902601f08a..8d4c5e9228a71 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java @@ -156,10 +156,15 @@ public EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final EntitiesV2GetRequestBuilder requestBuilder = - ENTITIES_V2_REQUEST_BUILDERS.get().aspectsParam(aspectNames).id(urn.toString()); + ENTITIES_V2_REQUEST_BUILDERS + .get() + .aspectsParam(aspectNames) + .id(urn.toString()) + .alwaysIncludeKeyAspectParam(alwaysIncludeKeyAspect); return sendClientRequest(requestBuilder, opContext.getSessionAuthentication()).getEntity(); } @@ -241,7 +246,8 @@ public Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Set urns, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { Map responseMap = new HashMap<>(); @@ -260,6 +266,7 @@ public Map batchGetV2( ENTITIES_V2_REQUEST_BUILDERS .batchGet() .aspectsParam(aspectNames) + .alwaysIncludeKeyAspectParam(alwaysIncludeKeyAspect) .ids( batch.stream() .map(Urn::toString) diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java index 2637e2d067c6d..aa17f1951bc91 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java @@ -59,6 +59,6 @@ public Map batchGetV2NoCache( @Nonnull Set urns, @Nullable Set aspectNames) throws RemoteInvocationException, URISyntaxException { - return super.batchGetV2(opContext, entityName, urns, aspectNames); + return super.batchGetV2(opContext, entityName, urns, aspectNames, false); } } diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java index 6033ead36f10e..30b187da00e91 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java @@ -309,7 +309,7 @@ private Task ingestProposals( log.debug("Proposals: {}", metadataChangeProposals); try { final AspectsBatch batch = AspectsBatchImpl.builder() - .mcps(metadataChangeProposals, auditStamp, opContext.getRetrieverContext().get(), + .mcps(metadataChangeProposals, 
auditStamp, opContext.getRetrieverContext(), opContext.getValidationContext().isAlternateValidation()) .build(); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java index 20209ddf44d64..896d81d3cbecc 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java @@ -64,7 +64,8 @@ public class EntityV2Resource extends CollectionResourceTaskTemplate get( - @Nonnull String urnStr, @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames) + @Nonnull String urnStr, @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames, + @QueryParam(PARAM_ALWAYS_INCLUDE_KEY_ASPECT) @Optional @Nullable Boolean alwaysIncludeKeyAspect) throws URISyntaxException { log.debug("GET V2 {}", urnStr); final Urn urn = Urn.createFromString(urnStr); @@ -90,7 +91,7 @@ public Task get( ? opContext.getEntityAspectNames(entityName) : new HashSet<>(Arrays.asList(aspectNames)); try { - return _entityService.getEntityV2(opContext, entityName, urn, projectedAspects); + return _entityService.getEntityV2(opContext, entityName, urn, projectedAspects, alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } catch (Exception e) { throw new RuntimeException( String.format( @@ -106,7 +107,8 @@ public Task get( @WithSpan public Task> batchGet( @Nonnull Set urnStrs, - @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames) + @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames, + @QueryParam(PARAM_ALWAYS_INCLUDE_KEY_ASPECT) @Optional @Nullable Boolean alwaysIncludeKeyAspect) throws URISyntaxException { log.debug("BATCH GET V2 {}", urnStrs.toString()); final Set urns = new HashSet<>(); @@ -138,7 +140,7 @@ public Task> batchGet( ? 
opContext.getEntityAspectNames(entityName) : new HashSet<>(Arrays.asList(aspectNames)); try { - return _entityService.getEntitiesV2(opContext, entityName, urns, projectedAspects); + return _entityService.getEntitiesV2(opContext, entityName, urns, projectedAspects, alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } catch (Exception e) { throw new RuntimeException( String.format( diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java index ef79a404c2145..11df52ad66709 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java @@ -1,5 +1,7 @@ package com.linkedin.metadata.resources.restli; +import javax.annotation.Nullable; + public final class RestliConstants { private RestliConstants() {} @@ -21,6 +23,7 @@ private RestliConstants() {} public static final String PARAM_INPUT = "input"; public static final String PARAM_MAX_HOPS = "maxHops"; public static final String PARAM_ASPECTS = "aspects"; + public static final String PARAM_ALWAYS_INCLUDE_KEY_ASPECT = "alwaysIncludeKeyAspect"; public static final String PARAM_FILTER = "filter"; public static final String PARAM_GROUP = "group"; public static final String PARAM_SORT = "sort"; diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java index 185874fac1382..a2092405da3ff 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java @@ -8,6 +8,7 @@ import com.linkedin.parseq.Task; import com.linkedin.restli.common.HttpStatus; import com.linkedin.restli.server.RestLiServiceException; +import io.datahubproject.metadata.exception.ActorAccessException; import java.util.Optional; import java.util.function.Supplier; import javax.annotation.Nonnull; @@ -38,6 +39,8 @@ public static Task toTask(@Nonnull Supplier supplier) { if (throwable instanceof IllegalArgumentException || throwable.getCause() instanceof IllegalArgumentException) { finalException = badRequestException(throwable.getMessage()); + } else if (throwable.getCause() instanceof ActorAccessException) { + finalException = forbidden(throwable.getCause().getMessage()); } else if (throwable instanceof APIThrottleException) { finalException = apiThrottled(throwable.getMessage()); } else if (throwable instanceof RestLiServiceException) { @@ -109,4 +112,9 @@ public static RestLiServiceException invalidArgumentsException(@Nullable String public static RestLiServiceException apiThrottled(@Nullable String message) { return new RestLiServiceException(HttpStatus.S_429_TOO_MANY_REQUESTS, message); } + + @Nonnull + public static RestLiServiceException forbidden(@Nullable String message) { + return new RestLiServiceException(HttpStatus.S_403_FORBIDDEN, message); + } } diff --git a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java 
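With RestliUtils now translating ActorAccessException into a 403 alongside the OpenAPI exception handler earlier in this patch, client automation mainly needs a consistent way to report why a request was rejected. A rough sketch, assuming the OpenAPI handlers' {"error": ...} body and falling back to other envelopes:

import requests


def get_json_or_explain_403(session: requests.Session, url: str, **kwargs) -> dict:
    # GET a DataHub endpoint and surface a readable reason on HTTP 403.
    # The OpenAPI handlers above return {"error": "..."}; the Rest.li layer
    # uses its own error envelope, so fall back to "message" or the raw body.
    response = session.get(url, **kwargs)
    if response.status_code == 403:
        try:
            body = response.json()
            reason = body.get("error") or body.get("message") or response.text
        except ValueError:
            reason = response.text
        raise PermissionError(f"rejected by DataHub authorization: {reason}")
    response.raise_for_status()
    return response.json()
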
index a39401c170a11..037b5b81fd4df 100644 --- a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java +++ b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java @@ -100,7 +100,7 @@ public void testAsyncDefaultAspects() throws URISyntaxException { .recordTemplate(mcp.getAspect()) .auditStamp(new AuditStamp()) .metadataChangeProposal(mcp) - .build(opContext.getAspectRetrieverOpt().get()); + .build(opContext.getAspectRetriever()); when(aspectDao.runInTransactionWithRetry(any(), any(), anyInt())) .thenReturn( List.of(List.of( diff --git a/smoke-test/tests/tokens/revokable_access_token_test.py b/smoke-test/tests/tokens/revokable_access_token_test.py index af29437c051e1..006daae39333e 100644 --- a/smoke-test/tests/tokens/revokable_access_token_test.py +++ b/smoke-test/tests/tokens/revokable_access_token_test.py @@ -9,6 +9,8 @@ wait_for_writes_to_sync, ) +from .token_utils import listUsers, removeUser + pytestmark = pytest.mark.no_cypress_suite1 # Disable telemetry @@ -490,45 +492,3 @@ def getAccessTokenMetadata(session, token): response.raise_for_status() return response.json() - - -def removeUser(session, urn): - # Remove user - json = { - "query": """mutation removeUser($urn: String!) { - removeUser(urn: $urn) - }""", - "variables": {"urn": urn}, - } - - response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) - - response.raise_for_status() - return response.json() - - -def listUsers(session): - input = { - "start": "0", - "count": "20", - } - - # list users - json = { - "query": """query listUsers($input: ListUsersInput!) { - listUsers(input: $input) { - start - count - total - users { - username - } - } - }""", - "variables": {"input": input}, - } - - response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) - - response.raise_for_status() - return response.json() diff --git a/smoke-test/tests/tokens/session_access_token_test.py b/smoke-test/tests/tokens/session_access_token_test.py new file mode 100644 index 0000000000000..a16abc4445303 --- /dev/null +++ b/smoke-test/tests/tokens/session_access_token_test.py @@ -0,0 +1,173 @@ +import os +import time + +import pytest +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import AuditStampClass, CorpUserStatusClass +from requests.exceptions import HTTPError + +from tests.utils import ( + get_admin_credentials, + get_frontend_url, + login_as, + wait_for_writes_to_sync, +) + +from .token_utils import getUserId, listUsers, removeUser + +pytestmark = pytest.mark.no_cypress_suite1 + +# Disable telemetry +os.environ["DATAHUB_TELEMETRY_ENABLED"] = "false" + +(admin_user, admin_pass) = get_admin_credentials() +user_urn = "urn:li:corpuser:sessionUser" + + +@pytest.fixture(scope="class") +def custom_user_session(): + """Fixture to execute setup before and tear down after all tests are run""" + admin_session = login_as(admin_user, admin_pass) + + res_data = removeUser(admin_session, user_urn) + assert res_data + assert "error" not in res_data + + # Test getting the invite token + get_invite_token_json = { + "query": """query getInviteToken($input: GetInviteTokenInput!) 
{ + getInviteToken(input: $input){ + inviteToken + } + }""", + "variables": {"input": {}}, + } + + get_invite_token_response = admin_session.post( + f"{get_frontend_url()}/api/v2/graphql", json=get_invite_token_json + ) + get_invite_token_response.raise_for_status() + get_invite_token_res_data = get_invite_token_response.json() + + assert get_invite_token_res_data + assert get_invite_token_res_data["data"] + invite_token = get_invite_token_res_data["data"]["getInviteToken"]["inviteToken"] + assert invite_token is not None + assert "error" not in invite_token + + # Pass the invite token when creating the user + sign_up_json = { + "fullName": "Test Session User", + "email": "sessionUser", + "password": "sessionUser", + "title": "Date Engineer", + "inviteToken": invite_token, + } + + sign_up_response = admin_session.post( + f"{get_frontend_url()}/signUp", json=sign_up_json + ) + sign_up_response.raise_for_status() + assert sign_up_response + assert "error" not in sign_up_response + # Sleep for eventual consistency + wait_for_writes_to_sync() + + # signUp will override the session cookie to the new user to be signed up. + admin_session.cookies.clear() + admin_session = login_as(admin_user, admin_pass) + + # Make user created user is there. + res_data = listUsers(admin_session) + assert res_data["data"] + assert res_data["data"]["listUsers"] + assert {"username": "sessionUser"} in res_data["data"]["listUsers"]["users"] + + yield login_as(sign_up_json["email"], sign_up_json["password"]) + + # Delete created user + res_data = removeUser(admin_session, user_urn) + assert res_data + assert res_data["data"] + assert res_data["data"]["removeUser"] is True + # Sleep for eventual consistency + wait_for_writes_to_sync() + + # Make user created user is not there. 
+ res_data = listUsers(admin_session) + assert res_data["data"] + assert res_data["data"]["listUsers"] + assert {"username": "sessionUser"} not in res_data["data"]["listUsers"]["users"] + + +@pytest.mark.dependency() +def test_soft_delete(graph_client, custom_user_session): + # assert initial access + assert getUserId(custom_user_session) == {"urn": user_urn} + + graph_client.soft_delete_entity(urn=user_urn) + wait_for_writes_to_sync() + + with pytest.raises(HTTPError) as req_info: + getUserId(custom_user_session) + assert "403 Client Error: Forbidden" in str(req_info.value) + + # undo soft delete + graph_client.set_soft_delete_status(urn=user_urn, delete=False) + wait_for_writes_to_sync() + + +@pytest.mark.dependency(depends=["test_soft_delete"]) +def test_suspend(graph_client, custom_user_session): + # assert initial access + assert getUserId(custom_user_session) == {"urn": user_urn} + + graph_client.emit( + MetadataChangeProposalWrapper( + entityType="corpuser", + entityUrn=user_urn, + changeType="UPSERT", + aspectName="corpUserStatus", + aspect=CorpUserStatusClass( + status="SUSPENDED", + lastModified=AuditStampClass( + time=int(time.time() * 1000.0), actor="urn:li:corpuser:unknown" + ), + ), + ) + ) + wait_for_writes_to_sync() + + with pytest.raises(HTTPError) as req_info: + getUserId(custom_user_session) + assert "403 Client Error: Forbidden" in str(req_info.value) + + # undo suspend + graph_client.emit( + MetadataChangeProposalWrapper( + entityType="corpuser", + entityUrn=user_urn, + changeType="UPSERT", + aspectName="corpUserStatus", + aspect=CorpUserStatusClass( + status="ACTIVE", + lastModified=AuditStampClass( + time=int(time.time() * 1000.0), actor="urn:li:corpuser:unknown" + ), + ), + ) + ) + wait_for_writes_to_sync() + + +@pytest.mark.dependency(depends=["test_suspend"]) +def test_hard_delete(graph_client, custom_user_session): + # assert initial access + assert getUserId(custom_user_session) == {"urn": user_urn} + + graph_client.hard_delete_entity(urn=user_urn) + wait_for_writes_to_sync() + + with pytest.raises(HTTPError) as req_info: + getUserId(custom_user_session) + assert "403 Client Error: Forbidden" in str(req_info.value) diff --git a/smoke-test/tests/tokens/token_utils.py b/smoke-test/tests/tokens/token_utils.py new file mode 100644 index 0000000000000..10558e7085de7 --- /dev/null +++ b/smoke-test/tests/tokens/token_utils.py @@ -0,0 +1,53 @@ +from tests.utils import get_frontend_url + + +def getUserId(session): + response = session.get( + f"{get_frontend_url()}/openapi/operations/identity/user/urn", + params={"skipCache": "true"}, + ) + + response.raise_for_status() + return response.json() + + +def removeUser(session, urn): + # Remove user + json = { + "query": """mutation removeUser($urn: String!) { + removeUser(urn: $urn) + }""", + "variables": {"urn": urn}, + } + + response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) + + response.raise_for_status() + return response.json() + + +def listUsers(session): + input = { + "start": "0", + "count": "20", + } + + # list users + json = { + "query": """query listUsers($input: ListUsersInput!) 
{ + listUsers(input: $input) { + start + count + total + users { + username + } + } + }""", + "variables": {"input": input}, + } + + response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) + + response.raise_for_status() + return response.json() From 83904b7f351c9ea8b9ac7737892b2b21caedb720 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Wed, 18 Dec 2024 17:02:16 -0500 Subject: [PATCH 3/8] fix(env) Fix forms hook env var default config (#12155) --- .../configuration/src/main/resources/application.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 75b4c8e8b002f..9010d77015f16 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -561,7 +561,7 @@ springdoc.api-docs.groups.enabled: true forms: hook: - enabled: { $FORMS_HOOK_ENABLED:true } + enabled: ${FORMS_HOOK_ENABLED:true} consumerGroupSuffix: ${FORMS_HOOK_CONSUMER_GROUP_SUFFIX:} businessAttribute: From da8f8221977444644596da40e676e15362bd7a2d Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 18 Dec 2024 14:36:10 -0800 Subject: [PATCH 4/8] feat(ingest/mlflow): Support configurable base_external_url (#12167) --- .../src/datahub/ingestion/source/mlflow.py | 35 ++++++++++++++++--- .../tests/unit/test_mlflow_source.py | 13 +++++++ 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py index cef6d2b1bb577..26d160acf330c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py @@ -38,16 +38,30 @@ class MLflowConfig(EnvConfigMixin): tracking_uri: Optional[str] = Field( default=None, - description="Tracking server URI. If not set, an MLflow default tracking_uri is used (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)", + description=( + "Tracking server URI. If not set, an MLflow default tracking_uri is used" + " (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)" + ), ) registry_uri: Optional[str] = Field( default=None, - description="Registry server URI. If not set, an MLflow default registry_uri is used (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)", + description=( + "Registry server URI. If not set, an MLflow default registry_uri is used" + " (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)" + ), ) model_name_separator: str = Field( default="_", description="A string which separates model name from its version (e.g. model_1 or model-1)", ) + base_external_url: Optional[str] = Field( + default=None, + description=( + "Base URL to use when constructing external URLs to MLflow." + " If not set, tracking_uri is used if it's an HTTP URL." + " If neither is set, external URLs are not generated." 
+ ), + ) @dataclass @@ -279,12 +293,23 @@ def _make_ml_model_urn(self, model_version: ModelVersion) -> str: ) return urn - def _make_external_url(self, model_version: ModelVersion) -> Union[None, str]: + def _get_base_external_url_from_tracking_uri(self) -> Optional[str]: + if isinstance( + self.client.tracking_uri, str + ) and self.client.tracking_uri.startswith("http"): + return self.client.tracking_uri + else: + return None + + def _make_external_url(self, model_version: ModelVersion) -> Optional[str]: """ Generate URL for a Model Version to MLflow UI. """ - base_uri = self.client.tracking_uri - if base_uri.startswith("http"): + base_uri = ( + self.config.base_external_url + or self._get_base_external_url_from_tracking_uri() + ) + if base_uri: return f"{base_uri.rstrip('/')}/#/models/{model_version.name}/versions/{model_version.version}" else: return None diff --git a/metadata-ingestion/tests/unit/test_mlflow_source.py b/metadata-ingestion/tests/unit/test_mlflow_source.py index d213dd92352e6..e882296b6f331 100644 --- a/metadata-ingestion/tests/unit/test_mlflow_source.py +++ b/metadata-ingestion/tests/unit/test_mlflow_source.py @@ -136,3 +136,16 @@ def test_make_external_link_remote(source, model_version): url = source._make_external_url(model_version) assert url == expected_url + + +def test_make_external_link_remote_via_config(source, model_version): + custom_base_url = "https://custom-server.org" + source.config.base_external_url = custom_base_url + source.client = MlflowClient( + tracking_uri="https://dummy-mlflow-tracking-server.org" + ) + expected_url = f"{custom_base_url}/#/models/{model_version.name}/versions/{model_version.version}" + + url = source._make_external_url(model_version) + + assert url == expected_url From 4392d72456faae5f0f59eb09756287182feec56b Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 18 Dec 2024 20:29:34 -0500 Subject: [PATCH 5/8] fix(cli/properties): fix data type validation (#12170) --- .../structuredproperties.py | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index e37281dea86e1..619f69b016262 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -14,7 +14,7 @@ PropertyValueClass, StructuredPropertyDefinitionClass, ) -from datahub.metadata.urns import StructuredPropertyUrn, Urn +from datahub.metadata.urns import DataTypeUrn, StructuredPropertyUrn, Urn from datahub.utilities.urns._urn_base import URN_TYPES logging.basicConfig(level=logging.INFO) @@ -86,19 +86,31 @@ class StructuredProperties(ConfigModel): @validator("type") def validate_type(cls, v: str) -> str: - # Convert to lowercase if needed - if not v.islower(): + # This logic is somewhat hacky, since we need to deal with + # 1. fully qualified urns + # 2. raw data types, that need to get the datahub namespace prefix + # While keeping the user-facing interface and error messages clean. + + if not v.startswith("urn:li:") and not v.islower(): + # Convert to lowercase if needed + v = v.lower() logger.warning( - f"Structured property type should be lowercase. Updated to {v.lower()}" + f"Structured property type should be lowercase. 
Updated to {v}" ) - v = v.lower() + + urn = Urn.make_data_type_urn(v) # Check if type is allowed - if not AllowedTypes.check_allowed_type(v): + data_type_urn = DataTypeUrn.from_string(urn) + unqualified_data_type = data_type_urn.id + if unqualified_data_type.startswith("datahub."): + unqualified_data_type = unqualified_data_type[len("datahub.") :] + if not AllowedTypes.check_allowed_type(unqualified_data_type): raise ValueError( - f"Type {v} is not allowed. Allowed types are {AllowedTypes.values()}" + f"Type {unqualified_data_type} is not allowed. Allowed types are {AllowedTypes.values()}" ) - return v + + return urn @property def fqn(self) -> str: From 48f3cc578589c5c0379d5117756f01a0228669b4 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Wed, 18 Dec 2024 21:53:20 -0600 Subject: [PATCH 6/8] fix(pgsql): Postgres doesn't support UNION select with FOR UPDATE (#12169) --- .../metadata/entity/ebean/EbeanAspectDao.java | 87 ++++++++++++++++++- .../metadata/config/EbeanConfiguration.java | 1 + .../src/main/resources/application.yaml | 1 + 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java index bd6cc67561b88..ea580a97c5188 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java @@ -93,8 +93,14 @@ public class EbeanAspectDao implements AspectDao, AspectMigrationsDao { */ private final LoadingCache locks; + private final String batchGetMethod; + public EbeanAspectDao(@Nonnull final Database server, EbeanConfiguration ebeanConfiguration) { _server = server; + this.batchGetMethod = + ebeanConfiguration.getBatchGetMethod() != null + ? ebeanConfiguration.getBatchGetMethod() + : "IN"; if (ebeanConfiguration.getLocking().isEnabled()) { this.locks = CacheBuilder.newBuilder() @@ -371,23 +377,37 @@ private List batchGet( final int totalPageCount = QueryUtils.getTotalPageCount(keys.size(), keysCount); final List finalResult = - batchGetUnion(new ArrayList<>(keys), keysCount, position, forUpdate); + batchGetSelectString(new ArrayList<>(keys), keysCount, position, forUpdate); while (QueryUtils.hasMore(position, keysCount, totalPageCount)) { position += keysCount; final List oneStatementResult = - batchGetUnion(new ArrayList<>(keys), keysCount, position, forUpdate); + batchGetSelectString(new ArrayList<>(keys), keysCount, position, forUpdate); finalResult.addAll(oneStatementResult); } return finalResult; } + @Nonnull + private List batchGetSelectString( + @Nonnull final List keys, + final int keysCount, + final int position, + boolean forUpdate) { + + if (batchGetMethod.equals("IN")) { + return batchGetIn(keys, keysCount, position, forUpdate); + } + + return batchGetUnion(keys, keysCount, position, forUpdate); + } + /** * Builds a single SELECT statement for batch get, which selects one entity, and then can be * UNION'd with other SELECT statements. 
*/ - private String batchGetSelect( + private String batchGetSelectString( final int selectId, @Nonnull final String urn, @Nonnull final String aspect, @@ -434,7 +454,7 @@ private List batchGetUnion( final Map params = new HashMap<>(); for (int index = position; index < end; index++) { sb.append( - batchGetSelect( + batchGetSelectString( index - position, keys.get(index).getUrn(), keys.get(index).getAspect(), @@ -467,6 +487,65 @@ private List batchGetUnion( return query.findList(); } + @Nonnull + private List batchGetIn( + @Nonnull final List keys, + final int keysCount, + final int position, + boolean forUpdate) { + validateConnection(); + + // Build a single SELECT with IN clause using composite key comparison + // Query will look like: + // SELECT * FROM metadata_aspect WHERE (urn, aspect, version) IN + // (('urn0', 'aspect0', 0), ('urn1', 'aspect1', 1)) + final StringBuilder sb = new StringBuilder(); + sb.append( + "SELECT urn, aspect, version, metadata, systemMetadata, createdOn, createdBy, createdFor "); + sb.append("FROM metadata_aspect_v2 WHERE (urn, aspect, version) IN ("); + + final int end = Math.min(keys.size(), position + keysCount); + final Map params = new HashMap<>(); + + for (int index = position; index < end; index++) { + int paramIndex = index - position; + String urnParam = "urn" + paramIndex; + String aspectParam = "aspect" + paramIndex; + String versionParam = "version" + paramIndex; + + params.put(urnParam, keys.get(index).getUrn()); + params.put(aspectParam, keys.get(index).getAspect()); + params.put(versionParam, keys.get(index).getVersion()); + + sb.append("(:" + urnParam + ", :" + aspectParam + ", :" + versionParam + ")"); + + if (index != end - 1) { + sb.append(","); + } + } + + sb.append(")"); + + if (forUpdate) { + sb.append(" FOR UPDATE"); + } + + final RawSql rawSql = + RawSqlBuilder.parse(sb.toString()) + .columnMapping(EbeanAspectV2.URN_COLUMN, "key.urn") + .columnMapping(EbeanAspectV2.ASPECT_COLUMN, "key.aspect") + .columnMapping(EbeanAspectV2.VERSION_COLUMN, "key.version") + .create(); + + final Query query = _server.find(EbeanAspectV2.class).setRawSql(rawSql); + + for (Map.Entry param : params.entrySet()) { + query.setParameter(param.getKey(), param.getValue()); + } + + return query.findList(); + } + @Override @Nonnull public ListResult listUrns( diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java index 47b406e695a3f..6eb31e14a2d3b 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java @@ -23,6 +23,7 @@ public class EbeanConfiguration { private boolean autoCreateDdl; private boolean postgresUseIamAuth; private LockingConfiguration locking; + private String batchGetMethod; public static final EbeanConfiguration testDefault = EbeanConfiguration.builder().locking(LockingConfiguration.testDefault).build(); diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 9010d77015f16..b997bc108e4ba 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -164,6 +164,7 @@ ebean: waitTimeoutMillis: ${EBEAN_WAIT_TIMEOUT_MILLIS:1000} autoCreateDdl: 
${EBEAN_AUTOCREATE:false} postgresUseIamAuth: ${EBEAN_POSTGRES_USE_AWS_IAM_AUTH:false} + batchGetMethod: ${EBEAN_BATCH_GET_METHOD:IN} # Alternative UNION locking: enabled: ${EBEAN_LOCKING_ENABLED:false} durationSeconds: ${EBEAN_LOCKING_DURATION_SECONDS:60} From 953893cf2e72e71580b21bdfc12592fca572e13b Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Thu, 19 Dec 2024 12:39:47 +0530 Subject: [PATCH 7/8] refactor(ingest/kafka-connect): define interface for new connector impl (#12149) --- metadata-ingestion/setup.py | 2 +- .../ingestion/source/kafka/kafka_connect.py | 1468 ----------------- .../source/kafka_connect/__init__.py | 0 .../ingestion/source/kafka_connect/common.py | 202 +++ .../source/kafka_connect/kafka_connect.py | 367 +++++ .../source/kafka_connect/sink_connectors.py | 341 ++++ .../source/kafka_connect/source_connectors.py | 570 +++++++ 7 files changed, 1481 insertions(+), 1469 deletions(-) delete mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 6334b3abbb8a0..c6994dd6d5aa6 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -741,7 +741,7 @@ "hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource", "json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource", "kafka = datahub.ingestion.source.kafka.kafka:KafkaSource", - "kafka-connect = datahub.ingestion.source.kafka.kafka_connect:KafkaConnectSource", + "kafka-connect = datahub.ingestion.source.kafka_connect.kafka_connect:KafkaConnectSource", "ldap = datahub.ingestion.source.ldap:LDAPSource", "looker = datahub.ingestion.source.looker.looker_source:LookerDashboardSource", "lookml = datahub.ingestion.source.looker.lookml_source:LookMLSource", diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py deleted file mode 100644 index 23a99ccb310e1..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py +++ /dev/null @@ -1,1468 +0,0 @@ -import logging -import re -from dataclasses import dataclass, field -from typing import Dict, Iterable, List, Optional, Tuple - -import jpype -import jpype.imports -import requests -from pydantic.fields import Field -from sqlalchemy.engine.url import make_url - -import datahub.emitter.mce_builder as builder -import datahub.metadata.schema_classes as models -from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import ( - DatasetLineageProviderConfigBase, - PlatformInstanceConfigMixin, -) -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.decorators import ( - SourceCapability, - SupportStatus, - capability, - config_class, - platform_name, - support_status, -) -from datahub.ingestion.api.source import 
MetadataWorkUnitProcessor, Source -from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( - get_platform_from_sqlalchemy_uri, -) -from datahub.ingestion.source.state.stale_entity_removal_handler import ( - StaleEntityRemovalHandler, - StaleEntityRemovalSourceReport, - StatefulStaleMetadataRemovalConfig, -) -from datahub.ingestion.source.state.stateful_ingestion_base import ( - StatefulIngestionConfigBase, - StatefulIngestionSourceBase, -) - -logger = logging.getLogger(__name__) - -KAFKA = "kafka" -SOURCE = "source" -SINK = "sink" -CONNECTOR_CLASS = "connector.class" - - -class ProvidedConfig(ConfigModel): - provider: str - path_key: str - value: str - - -class GenericConnectorConfig(ConfigModel): - connector_name: str - source_dataset: str - source_platform: str - - -class KafkaConnectSourceConfig( - PlatformInstanceConfigMixin, - DatasetLineageProviderConfigBase, - StatefulIngestionConfigBase, -): - # See the Connect REST Interface for details - # https://docs.confluent.io/platform/current/connect/references/restapi.html# - connect_uri: str = Field( - default="http://localhost:8083/", description="URI to connect to." - ) - username: Optional[str] = Field(default=None, description="Kafka Connect username.") - password: Optional[str] = Field(default=None, description="Kafka Connect password.") - cluster_name: Optional[str] = Field( - default="connect-cluster", description="Cluster to ingest from." - ) - # convert lineage dataset's urns to lowercase - convert_lineage_urns_to_lowercase: bool = Field( - default=False, - description="Whether to convert the urns of ingested lineage dataset to lowercase", - ) - connector_patterns: AllowDenyPattern = Field( - default=AllowDenyPattern.allow_all(), - description="regex patterns for connectors to filter for ingestion.", - ) - provided_configs: Optional[List[ProvidedConfig]] = Field( - default=None, description="Provided Configurations" - ) - connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field( - default=None, - description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`', - ) - platform_instance_map: Optional[Dict[str, str]] = Field( - default=None, - description='Platform instance mapping to use when constructing URNs. 
e.g.`platform_instance_map: { "hive": "warehouse" }`', - ) - generic_connectors: List[GenericConnectorConfig] = Field( - default=[], - description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector", - ) - - stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None - - -@dataclass -class KafkaConnectSourceReport(StaleEntityRemovalSourceReport): - connectors_scanned: int = 0 - filtered: List[str] = field(default_factory=list) - - def report_connector_scanned(self, connector: str) -> None: - self.connectors_scanned += 1 - - def report_dropped(self, connector: str) -> None: - self.filtered.append(connector) - - -@dataclass -class KafkaConnectLineage: - """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob""" - - source_platform: str - target_dataset: str - target_platform: str - job_property_bag: Optional[Dict[str, str]] = None - source_dataset: Optional[str] = None - - -@dataclass -class ConnectorManifest: - """Each instance is potential DataFlow""" - - name: str - type: str - config: Dict - tasks: Dict - url: Optional[str] = None - flow_property_bag: Optional[Dict[str, str]] = None - lineages: List[KafkaConnectLineage] = field(default_factory=list) - topic_names: Iterable[str] = field(default_factory=list) - - -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - index = len(prefix) - return text[index:] - return text - - -def unquote( - string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None -) -> str: - """ - If string starts and ends with a quote, unquote it - """ - trailing_quote = trailing_quote if trailing_quote else leading_quote - if string.startswith(leading_quote) and string.endswith(trailing_quote): - string = string[1:-1] - return string - - -def get_dataset_name( - database_name: Optional[str], - source_table: str, -) -> str: - if database_name: - dataset_name = database_name + "." + source_table - else: - dataset_name = source_table - - return dataset_name - - -def get_platform_instance( - config: KafkaConnectSourceConfig, connector_name: str, platform: str -) -> Optional[str]: - instance_name = None - if ( - config.connect_to_platform_map - and config.connect_to_platform_map.get(connector_name) - and config.connect_to_platform_map[connector_name].get(platform) - ): - instance_name = config.connect_to_platform_map[connector_name][platform] - if config.platform_instance_map and config.platform_instance_map.get(platform): - logger.warning( - f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map." - "Will prefer connector specific platform instance from connect_to_platform_map." 
- ) - elif config.platform_instance_map and config.platform_instance_map.get(platform): - instance_name = config.platform_instance_map[platform] - logger.info( - f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}" - ) - return instance_name - - -@dataclass -class ConfluentJDBCSourceConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, - connector_manifest: ConnectorManifest, - config: KafkaConnectSourceConfig, - report: KafkaConnectSourceReport, - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self.report = report - self._extract_lineages() - - REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter" - KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER] - # https://kafka.apache.org/documentation/#connect_included_transformation - KAFKA_NONTOPICROUTING_TRANSFORMS = [ - "InsertField", - "InsertField$Key", - "InsertField$Value", - "ReplaceField", - "ReplaceField$Key", - "ReplaceField$Value", - "MaskField", - "MaskField$Key", - "MaskField$Value", - "ValueToKey", - "ValueToKey$Key", - "ValueToKey$Value", - "HoistField", - "HoistField$Key", - "HoistField$Value", - "ExtractField", - "ExtractField$Key", - "ExtractField$Value", - "SetSchemaMetadata", - "SetSchemaMetadata$Key", - "SetSchemaMetadata$Value", - "Flatten", - "Flatten$Key", - "Flatten$Value", - "Cast", - "Cast$Key", - "Cast$Value", - "HeadersFrom", - "HeadersFrom$Key", - "HeadersFrom$Value", - "TimestampConverter", - "Filter", - "InsertHeader", - "DropHeaders", - ] - # https://docs.confluent.io/platform/current/connect/transforms/overview.html - CONFLUENT_NONTOPICROUTING_TRANSFORMS = [ - "Drop", - "Drop$Key", - "Drop$Value", - "Filter", - "Filter$Key", - "Filter$Value", - "TombstoneHandler", - ] - KNOWN_NONTOPICROUTING_TRANSFORMS = ( - KAFKA_NONTOPICROUTING_TRANSFORMS - + [ - f"org.apache.kafka.connect.transforms.{t}" - for t in KAFKA_NONTOPICROUTING_TRANSFORMS - ] - + CONFLUENT_NONTOPICROUTING_TRANSFORMS - + [ - f"io.confluent.connect.transforms.{t}" - for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS - ] - ) - - @dataclass - class JdbcParser: - db_connection_url: str - source_platform: str - database_name: str - topic_prefix: str - query: str - transforms: list - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> JdbcParser: - url = remove_prefix( - str(connector_manifest.config.get("connection.url")), "jdbc:" - ) - url_instance = make_url(url) - source_platform = get_platform_from_sqlalchemy_uri(str(url_instance)) - database_name = url_instance.database - assert database_name - db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}" - - topic_prefix = self.connector_manifest.config.get("topic.prefix", None) - - query = self.connector_manifest.config.get("query", None) - - transform_names = ( - self.connector_manifest.config.get("transforms", "").split(",") - if self.connector_manifest.config.get("transforms") - else [] - ) - - transforms = [] - for name in transform_names: - transform = {"name": name} - transforms.append(transform) - for key in self.connector_manifest.config.keys(): - if key.startswith(f"transforms.{name}."): - transform[ - key.replace(f"transforms.{name}.", "") - ] = self.connector_manifest.config[key] - - return self.JdbcParser( - db_connection_url, - source_platform, - database_name, - topic_prefix, - query, - transforms, - ) - - def default_get_lineages( - self, - topic_prefix: str, - database_name: 
str, - source_platform: str, - topic_names: Optional[Iterable[str]] = None, - include_source_dataset: bool = True, - ) -> List[KafkaConnectLineage]: - lineages: List[KafkaConnectLineage] = [] - if not topic_names: - topic_names = self.connector_manifest.topic_names - table_name_tuples: List[Tuple] = self.get_table_names() - for topic in topic_names: - # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM)) - source_table: str = ( - remove_prefix(topic, topic_prefix) if topic_prefix else topic - ) - # include schema name for three-level hierarchies - if has_three_level_hierarchy(source_platform): - table_name_tuple: Tuple = next( - iter([t for t in table_name_tuples if t and t[-1] == source_table]), - (), - ) - if len(table_name_tuple) > 1: - source_table = f"{table_name_tuple[-2]}.{source_table}" - else: - include_source_dataset = False - self.report.warning( - "Could not find schema for table" - f"{self.connector_manifest.name} : {source_table}", - ) - dataset_name: str = get_dataset_name(database_name, source_table) - lineage = KafkaConnectLineage( - source_dataset=dataset_name if include_source_dataset else None, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - return lineages - - def get_table_names(self) -> List[Tuple]: - sep: str = "." - leading_quote_char: str = '"' - trailing_quote_char: str = leading_quote_char - - table_ids: List[str] = [] - if self.connector_manifest.tasks: - table_ids = ( - ",".join( - [ - task["config"].get("tables") - for task in self.connector_manifest.tasks - ] - ) - ).split(",") - quote_method = self.connector_manifest.config.get( - "quote.sql.identifiers", "always" - ) - if ( - quote_method == "always" - and table_ids - and table_ids[0] - and table_ids[-1] - ): - leading_quote_char = table_ids[0][0] - trailing_quote_char = table_ids[-1][-1] - # This will only work for single character quotes - elif self.connector_manifest.config.get("table.whitelist"): - table_ids = self.connector_manifest.config.get("table.whitelist").split(",") # type: ignore - - # List of Tuple containing (schema, table) - tables: List[Tuple] = [ - ( - ( - unquote( - table_id.split(sep)[-2], leading_quote_char, trailing_quote_char - ) - if len(table_id.split(sep)) > 1 - else "" - ), - unquote( - table_id.split(sep)[-1], leading_quote_char, trailing_quote_char - ), - ) - for table_id in table_ids - ] - return tables - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - database_name = parser.database_name - query = parser.query - topic_prefix = parser.topic_prefix - transforms = parser.transforms - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # Mask/Remove properties that may reveal credentials - self.connector_manifest.flow_property_bag[ - "connection.url" - ] = parser.db_connection_url - if "connection.password" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["connection.password"] - if "connection.user" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["connection.user"] - - logging.debug( - f"Extracting source platform: {source_platform} and database name: {database_name} from connection url " - ) - - if not self.connector_manifest.topic_names: - self.connector_manifest.lineages 
= lineages - return - - if query: - # Lineage source_table can be extracted by parsing query - for topic in self.connector_manifest.topic_names: - # default method - as per earlier implementation - dataset_name: str = get_dataset_name(database_name, topic) - - lineage = KafkaConnectLineage( - source_dataset=None, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.report.warning( - "Could not find input dataset, the connector has query configuration set", - self.connector_manifest.name, - ) - self.connector_manifest.lineages = lineages - return - - SINGLE_TRANSFORM = len(transforms) == 1 - NO_TRANSFORM = len(transforms) == 0 - UNKNOWN_TRANSFORM = any( - [ - transform["type"] - not in self.KNOWN_TOPICROUTING_TRANSFORMS - + self.KNOWN_NONTOPICROUTING_TRANSFORMS - for transform in transforms - ] - ) - ALL_TRANSFORMS_NON_TOPICROUTING = all( - [ - transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS - for transform in transforms - ] - ) - - if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING: - self.connector_manifest.lineages = self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - ) - return - - if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER: - tables = self.get_table_names() - topic_names = list(self.connector_manifest.topic_names) - - from java.util.regex import Pattern - - for table in tables: - source_table: str = table[-1] - topic = topic_prefix + source_table if topic_prefix else source_table - - transform_regex = Pattern.compile(transforms[0]["regex"]) - transform_replacement = transforms[0]["replacement"] - - matcher = transform_regex.matcher(topic) - if matcher.matches(): - topic = str(matcher.replaceFirst(transform_replacement)) - - # Additional check to confirm that the topic present - # in connector topics - - if topic in self.connector_manifest.topic_names: - # include schema name for three-level hierarchies - if has_three_level_hierarchy(source_platform) and len(table) > 1: - source_table = f"{table[-2]}.{table[-1]}" - - dataset_name = get_dataset_name(database_name, source_table) - - lineage = KafkaConnectLineage( - source_dataset=dataset_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - topic_names.remove(topic) - lineages.append(lineage) - - if topic_names: - lineages.extend( - self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - topic_names=topic_names, - include_source_dataset=False, - ) - ) - self.report.warning( - "Could not find input dataset for connector topics", - f"{self.connector_manifest.name} : {topic_names}", - ) - self.connector_manifest.lineages = lineages - return - else: - include_source_dataset = True - if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: - self.report.warning( - "Could not find input dataset, connector has unknown transform", - f"{self.connector_manifest.name} : {transforms[0]['type']}", - ) - include_source_dataset = False - if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: - self.report.warning( - "Could not find input dataset, connector has one or more unknown transforms", - self.connector_manifest.name, - ) - include_source_dataset = False - lineages = self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - include_source_dataset=include_source_dataset, - ) - self.connector_manifest.lineages = lineages - 
return - - -@dataclass -class MongoSourceConnector: - # https://www.mongodb.com/docs/kafka-connector/current/source-connector/ - - connector_manifest: ConnectorManifest - - def __init__( - self, connector_manifest: ConnectorManifest, config: KafkaConnectSourceConfig - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self._extract_lineages() - - @dataclass - class MongoSourceParser: - db_connection_url: Optional[str] - source_platform: str - database_name: Optional[str] - topic_prefix: Optional[str] - transforms: List[str] - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> MongoSourceParser: - parser = self.MongoSourceParser( - db_connection_url=connector_manifest.config.get("connection.uri"), - source_platform="mongodb", - database_name=connector_manifest.config.get("database"), - topic_prefix=connector_manifest.config.get("topic_prefix"), - transforms=( - connector_manifest.config["transforms"].split(",") - if "transforms" in connector_manifest.config - else [] - ), - ) - - return parser - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)" - - if not self.connector_manifest.topic_names: - return lineages - - for topic in self.connector_manifest.topic_names: - found = re.search(re.compile(topic_naming_pattern), topic) - - if found: - table_name = get_dataset_name(found.group(1), found.group(2)) - - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages - - -@dataclass -class DebeziumSourceConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, - connector_manifest: ConnectorManifest, - config: KafkaConnectSourceConfig, - report: KafkaConnectSourceReport, - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self.report = report - self._extract_lineages() - - @dataclass - class DebeziumParser: - source_platform: str - server_name: Optional[str] - database_name: Optional[str] - - def get_server_name(self, connector_manifest: ConnectorManifest) -> str: - if "topic.prefix" in connector_manifest.config: - return connector_manifest.config["topic.prefix"] - else: - return connector_manifest.config.get("database.server.name", "") - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> DebeziumParser: - connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "") - - if connector_class == "io.debezium.connector.mysql.MySqlConnector": - parser = self.DebeziumParser( - source_platform="mysql", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "MySqlConnector": - parser = self.DebeziumParser( - source_platform="mysql", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector": - parser = self.DebeziumParser( - source_platform="mongodb", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "io.debezium.connector.postgresql.PostgresConnector": - parser = self.DebeziumParser( - source_platform="postgres", - server_name=self.get_server_name(connector_manifest), - 
database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.oracle.OracleConnector": - parser = self.DebeziumParser( - source_platform="oracle", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": - database_name = connector_manifest.config.get( - "database.names" - ) or connector_manifest.config.get("database.dbname") - - if "," in str(database_name): - raise Exception( - f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}" - ) - - parser = self.DebeziumParser( - source_platform="mssql", - server_name=self.get_server_name(connector_manifest), - database_name=database_name, - ) - elif connector_class == "io.debezium.connector.db2.Db2Connector": - parser = self.DebeziumParser( - source_platform="db2", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.vitess.VitessConnector": - parser = self.DebeziumParser( - source_platform="vitess", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("vitess.keyspace"), - ) - else: - raise ValueError(f"Connector class '{connector_class}' is unknown.") - - return parser - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - - try: - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - server_name = parser.server_name - database_name = parser.database_name - topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)" - - if not self.connector_manifest.topic_names: - return lineages - - for topic in self.connector_manifest.topic_names: - found = re.search(re.compile(topic_naming_pattern), topic) - - if found: - table_name = get_dataset_name(database_name, found.group(2)) - - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages - except Exception as e: - self.report.warning( - "Error resolving lineage for connector", - self.connector_manifest.name, - exc=e, - ) - - return - - -@dataclass -class BigQuerySinkConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class BQParser: - project: str - target_platform: str - sanitizeTopics: str - transforms: list - topicsToTables: Optional[str] = None - datasets: Optional[str] = None - defaultDataset: Optional[str] = None - version: str = "v1" - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> BQParser: - project = connector_manifest.config["project"] - sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false") - transform_names = ( - self.connector_manifest.config.get("transforms", "").split(",") - if self.connector_manifest.config.get("transforms") - else [] - ) - transforms = [] - for name in transform_names: - transform = {"name": name} - transforms.append(transform) - for key in self.connector_manifest.config.keys(): - if key.startswith(f"transforms.{name}."): - transform[ - 
key.replace(f"transforms.{name}.", "") - ] = self.connector_manifest.config[key] - - if "defaultDataset" in connector_manifest.config: - defaultDataset = connector_manifest.config["defaultDataset"] - return self.BQParser( - project=project, - defaultDataset=defaultDataset, - target_platform="bigquery", - sanitizeTopics=sanitizeTopics.lower() == "true", - version="v2", - transforms=transforms, - ) - else: - # version 1.6.x and similar configs supported - datasets = connector_manifest.config["datasets"] - topicsToTables = connector_manifest.config.get("topicsToTables") - - return self.BQParser( - project=project, - topicsToTables=topicsToTables, - datasets=datasets, - target_platform="bigquery", - sanitizeTopics=sanitizeTopics.lower() == "true", - transforms=transforms, - ) - - def get_list(self, property: str) -> Iterable[Tuple[str, str]]: - entries = property.split(",") - for entry in entries: - key, val = entry.rsplit("=") - yield (key.strip(), val.strip()) - - def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]: - topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets)) # type: ignore - from java.util.regex import Pattern - - for pattern, dataset in topicregex_dataset_map.items(): - patternMatcher = Pattern.compile(pattern).matcher(topic) - if patternMatcher.matches(): - return dataset - return None - - def sanitize_table_name(self, table_name): - table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name) - if re.match("^[^a-zA-Z_].*", table_name): - table_name = "_" + table_name - - return table_name - - def get_dataset_table_for_topic( - self, topic: str, parser: BQParser - ) -> Optional[str]: - if parser.version == "v2": - dataset = parser.defaultDataset - parts = topic.split(":") - if len(parts) == 2: - dataset = parts[0] - table = parts[1] - else: - table = parts[0] - else: - dataset = self.get_dataset_for_topic_v1(topic, parser) - if dataset is None: - return None - - table = topic - if parser.topicsToTables: - topicregex_table_map: Dict[str, str] = dict( - self.get_list(parser.topicsToTables) # type: ignore - ) - from java.util.regex import Pattern - - for pattern, tbl in topicregex_table_map.items(): - patternMatcher = Pattern.compile(pattern).matcher(topic) - if patternMatcher.matches(): - table = tbl - break - - if parser.sanitizeTopics: - table = self.sanitize_table_name(table) - return f"{dataset}.{table}" - - def apply_transformations( - self, topic: str, transforms: List[Dict[str, str]] - ) -> str: - for transform in transforms: - if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter": - regex = transform["regex"] - replacement = transform["replacement"] - pattern = re.compile(regex) - if pattern.match(topic): - topic = pattern.sub(replacement, topic, count=1) - return topic - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - if not parser: - return lineages - target_platform = parser.target_platform - project = parser.project - transforms = parser.transforms - self.connector_manifest.flow_property_bag = self.connector_manifest.config - # Mask/Remove properties that may reveal credentials - if "keyfile" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["keyfile"] - - for topic in self.connector_manifest.topic_names: - transformed_topic = self.apply_transformations(topic, transforms) - dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser) - if dataset_table is None: 
- self.report.warning( - "Could not find target dataset for topic, please check your connector configuration" - f"{self.connector_manifest.name} : {transformed_topic} ", - ) - continue - target_dataset = f"{project}.{dataset_table}" - - lineages.append( - KafkaConnectLineage( - source_dataset=transformed_topic, - source_platform=KAFKA, - target_dataset=target_dataset, - target_platform=target_platform, - ) - ) - self.connector_manifest.lineages = lineages - return - - -@dataclass -class SnowflakeSinkConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class SnowflakeParser: - database_name: str - schema_name: str - topics_to_tables: Dict[str, str] - - def get_table_name_from_topic_name(self, topic_name: str) -> str: - """ - This function converts the topic name to a valid Snowflake table name using some rules. - Refer below link for more info - https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics - """ - table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name) - if re.match("^[^a-zA-Z_].*", table_name): - table_name = "_" + table_name - # Connector may append original topic's hash code as suffix for conflict resolution - # if generated table names for 2 topics are similar. This corner case is not handled here. - # Note that Snowflake recommends to choose topic names that follow the rules for - # Snowflake identifier names so this case is not recommended by snowflake. - return table_name - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> SnowflakeParser: - database_name = connector_manifest.config["snowflake.database.name"] - schema_name = connector_manifest.config["snowflake.schema.name"] - - # Fetch user provided topic to table map - provided_topics_to_tables: Dict[str, str] = {} - if connector_manifest.config.get("snowflake.topic2table.map"): - for each in connector_manifest.config["snowflake.topic2table.map"].split( - "," - ): - topic, table = each.split(":") - provided_topics_to_tables[topic.strip()] = table.strip() - - topics_to_tables: Dict[str, str] = {} - # Extract lineage for only those topics whose data ingestion started - for topic in connector_manifest.topic_names: - if topic in provided_topics_to_tables: - # If user provided which table to get mapped with this topic - topics_to_tables[topic] = provided_topics_to_tables[topic] - else: - # Else connector converts topic name to a valid Snowflake table name. 
- topics_to_tables[topic] = self.get_table_name_from_topic_name(topic) - - return self.SnowflakeParser( - database_name=database_name, - schema_name=schema_name, - topics_to_tables=topics_to_tables, - ) - - def _extract_lineages(self): - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # For all snowflake sink connector properties, refer below link - # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector - # remove private keys, secrets from properties - secret_properties = [ - "snowflake.private.key", - "snowflake.private.key.passphrase", - "value.converter.basic.auth.user.info", - ] - for k in secret_properties: - if k in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag[k] - - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - - for topic, table in parser.topics_to_tables.items(): - target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}" - lineages.append( - KafkaConnectLineage( - source_dataset=topic, - source_platform=KAFKA, - target_dataset=target_dataset, - target_platform="snowflake", - ) - ) - - self.connector_manifest.lineages = lineages - return - - -@dataclass -class ConfluentS3SinkConnector: - connector_manifest: ConnectorManifest - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class S3SinkParser: - target_platform: str - bucket: str - topics_dir: str - topics: Iterable[str] - - def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser: - # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3 - bucket = connector_manifest.config.get("s3.bucket.name") - if not bucket: - raise ValueError( - "Could not find 's3.bucket.name' in connector configuration" - ) - - # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage - topics_dir = connector_manifest.config.get("topics.dir", "topics") - - return self.S3SinkParser( - target_platform="s3", - bucket=bucket, - topics_dir=topics_dir, - topics=connector_manifest.topic_names, - ) - - def _extract_lineages(self): - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # remove keys, secrets from properties - secret_properties = [ - "aws.access.key.id", - "aws.secret.access.key", - "s3.sse.customer.key", - "s3.proxy.password", - ] - for k in secret_properties: - if k in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag[k] - - try: - parser = self._get_parser(self.connector_manifest) - - lineages: List[KafkaConnectLineage] = list() - for topic in parser.topics: - target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}" - - lineages.append( - KafkaConnectLineage( - source_dataset=topic, - source_platform="kafka", - target_dataset=target_dataset, - target_platform=parser.target_platform, - ) - ) - self.connector_manifest.lineages = lineages - except Exception as e: - self.report.warning( - "Error resolving lineage for connector", - self.connector_manifest.name, - exc=e, - ) - - return - - -def transform_connector_config( - connector_config: Dict, provided_configs: List[ProvidedConfig] -) -> None: - """This method will update provided configs in connector config values, if any""" - lookupsByProvider = {} - for pconfig in 
provided_configs: - lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value - for k, v in connector_config.items(): - for key, value in lookupsByProvider.items(): - if key in v: - connector_config[k] = connector_config[k].replace(key, value) - - -@platform_name("Kafka Connect") -@config_class(KafkaConnectSourceConfig) -@support_status(SupportStatus.CERTIFIED) -@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") -@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default") -@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") -class KafkaConnectSource(StatefulIngestionSourceBase): - config: KafkaConnectSourceConfig - report: KafkaConnectSourceReport - platform: str = "kafka-connect" - - def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext): - super().__init__(config, ctx) - self.config = config - self.report = KafkaConnectSourceReport() - self.session = requests.Session() - self.session.headers.update( - { - "Accept": "application/json", - "Content-Type": "application/json", - } - ) - - # Test the connection - if self.config.username is not None and self.config.password is not None: - logger.info( - f"Connecting to {self.config.connect_uri} with Authentication..." - ) - self.session.auth = (self.config.username, self.config.password) - - test_response = self.session.get(f"{self.config.connect_uri}/connectors") - test_response.raise_for_status() - logger.info(f"Connection to {self.config.connect_uri} is ok") - if not jpype.isJVMStarted(): - jpype.startJVM() - - @classmethod - def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: - config = KafkaConnectSourceConfig.parse_obj(config_dict) - return cls(config, ctx) - - def get_connectors_manifest(self) -> List[ConnectorManifest]: - """Get Kafka Connect connectors manifest using REST API. - Enrich with lineages metadata. 
- """ - connectors_manifest = list() - - connector_response = self.session.get( - f"{self.config.connect_uri}/connectors", - ) - - payload = connector_response.json() - - for connector_name in payload: - connector_url = f"{self.config.connect_uri}/connectors/{connector_name}" - connector_manifest = self._get_connector_manifest( - connector_name, connector_url - ) - if ( - connector_manifest is None - or not self.config.connector_patterns.allowed(connector_manifest.name) - ): - self.report.report_dropped(connector_name) - continue - - if self.config.provided_configs: - transform_connector_config( - connector_manifest.config, self.config.provided_configs - ) - # Initialize connector lineages - connector_manifest.lineages = list() - connector_manifest.url = connector_url - - connector_manifest.topic_names = self._get_connector_topics(connector_name) - - # Populate Source Connector metadata - if connector_manifest.type == SOURCE: - connector_manifest.tasks = self._get_connector_tasks(connector_name) - - # JDBC source connector lineages - if connector_manifest.config.get(CONNECTOR_CLASS).__eq__( - "io.confluent.connect.jdbc.JdbcSourceConnector" - ): - connector_manifest = ConfluentJDBCSourceConnector( - connector_manifest=connector_manifest, - config=self.config, - report=self.report, - ).connector_manifest - elif connector_manifest.config.get(CONNECTOR_CLASS, "").startswith( - "io.debezium.connector" - ): - connector_manifest = DebeziumSourceConnector( - connector_manifest=connector_manifest, - config=self.config, - report=self.report, - ).connector_manifest - elif ( - connector_manifest.config.get(CONNECTOR_CLASS, "") - == "com.mongodb.kafka.connect.MongoSourceConnector" - ): - connector_manifest = MongoSourceConnector( - connector_manifest=connector_manifest, config=self.config - ).connector_manifest - else: - # Find the target connector object in the list, or log an error if unknown. - target_connector = None - for connector in self.config.generic_connectors: - if connector.connector_name == connector_manifest.name: - target_connector = connector - break - if not target_connector: - logger.warning( - f"Detected undefined connector {connector_manifest.name}, which is not in the customized connector list. Please refer to Kafka Connect ingestion recipe to define this customized connector." - ) - continue - - for topic in connector_manifest.topic_names: - lineage = KafkaConnectLineage( - source_dataset=target_connector.source_dataset, - source_platform=target_connector.source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - - connector_manifest.lineages.append(lineage) - - if connector_manifest.type == SINK: - if connector_manifest.config.get(CONNECTOR_CLASS).__eq__( - "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector" - ): - connector_manifest = BigQuerySinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - elif connector_manifest.config.get("connector.class").__eq__( - "io.confluent.connect.s3.S3SinkConnector" - ): - connector_manifest = ConfluentS3SinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - elif connector_manifest.config.get("connector.class").__eq__( - "com.snowflake.kafka.connector.SnowflakeSinkConnector" - ): - connector_manifest = SnowflakeSinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - else: - self.report.report_dropped(connector_manifest.name) - logger.warning( - f"Skipping connector {connector_manifest.name}. 
Lineage for Connector not yet implemented" - ) - pass - - connectors_manifest.append(connector_manifest) - - return connectors_manifest - - def _get_connector_manifest( - self, connector_name: str, connector_url: str - ) -> Optional[ConnectorManifest]: - try: - connector_response = self.session.get(connector_url) - connector_response.raise_for_status() - except Exception as e: - self.report.warning( - "Failed to get connector details", connector_name, exc=e - ) - return None - manifest = connector_response.json() - connector_manifest = ConnectorManifest(**manifest) - return connector_manifest - - def _get_connector_tasks(self, connector_name: str) -> dict: - try: - response = self.session.get( - f"{self.config.connect_uri}/connectors/{connector_name}/tasks", - ) - response.raise_for_status() - except Exception as e: - self.report.warning( - "Error getting connector tasks", context=connector_name, exc=e - ) - return {} - - return response.json() - - def _get_connector_topics(self, connector_name: str) -> List[str]: - try: - response = self.session.get( - f"{self.config.connect_uri}/connectors/{connector_name}/topics", - ) - response.raise_for_status() - except Exception as e: - self.report.warning( - "Error getting connector topics", context=connector_name, exc=e - ) - return [] - - return response.json()[connector_name]["topics"] - - def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit: - connector_name = connector.name - connector_type = connector.type - connector_class = connector.config.get(CONNECTOR_CLASS) - flow_property_bag = connector.flow_property_bag - # connector_url = connector.url # NOTE: this will expose connector credential when used - flow_urn = builder.make_data_flow_urn( - self.platform, - connector_name, - self.config.env, - self.config.platform_instance, - ) - - return MetadataChangeProposalWrapper( - entityUrn=flow_urn, - aspect=models.DataFlowInfoClass( - name=connector_name, - description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.", - customProperties=flow_property_bag, - # externalUrl=connector_url, # NOTE: this will expose connector credential when used - ), - ).as_workunit() - - def construct_job_workunits( - self, connector: ConnectorManifest - ) -> Iterable[MetadataWorkUnit]: - connector_name = connector.name - flow_urn = builder.make_data_flow_urn( - self.platform, - connector_name, - self.config.env, - self.config.platform_instance, - ) - - lineages = connector.lineages - if lineages: - for lineage in lineages: - source_dataset = lineage.source_dataset - source_platform = lineage.source_platform - target_dataset = lineage.target_dataset - target_platform = lineage.target_platform - job_property_bag = lineage.job_property_bag - - source_platform_instance = get_platform_instance( - self.config, connector_name, source_platform - ) - target_platform_instance = get_platform_instance( - self.config, connector_name, target_platform - ) - - job_id = self.get_job_id(lineage, connector, self.config) - job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id) - - inlets = ( - [ - self.make_lineage_dataset_urn( - source_platform, source_dataset, source_platform_instance - ) - ] - if source_dataset - else [] - ) - outlets = [ - self.make_lineage_dataset_urn( - target_platform, target_dataset, target_platform_instance - ) - ] - - yield MetadataChangeProposalWrapper( - entityUrn=job_urn, - aspect=models.DataJobInfoClass( - name=f"{connector_name}:{job_id}", - type="COMMAND", - 
customProperties=job_property_bag, - ), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=job_urn, - aspect=models.DataJobInputOutputClass( - inputDatasets=inlets, - outputDatasets=outlets, - ), - ).as_workunit() - - def get_job_id( - self, - lineage: KafkaConnectLineage, - connector: ConnectorManifest, - config: KafkaConnectSourceConfig, - ) -> str: - connector_class = connector.config.get(CONNECTOR_CLASS) - - # Note - This block is only to maintain backward compatibility of Job URN - if ( - connector_class - and connector.type == SOURCE - and ( - "JdbcSourceConnector" in connector_class - or connector_class.startswith("io.debezium.connector") - ) - and lineage.source_dataset - and config.connect_to_platform_map - and config.connect_to_platform_map.get(connector.name) - and config.connect_to_platform_map[connector.name].get( - lineage.source_platform - ) - ): - return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}" - - return ( - lineage.source_dataset - if lineage.source_dataset - else f"unknown_source.{lineage.target_dataset}" - ) - - def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: - return [ - *super().get_workunit_processors(), - StaleEntityRemovalHandler.create( - self, self.config, self.ctx - ).workunit_processor, - ] - - def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - connectors_manifest = self.get_connectors_manifest() - for connector in connectors_manifest: - name = connector.name - - yield self.construct_flow_workunit(connector) - yield from self.construct_job_workunits(connector) - self.report.report_connector_scanned(name) - - def get_report(self) -> KafkaConnectSourceReport: - return self.report - - def make_lineage_dataset_urn( - self, platform: str, name: str, platform_instance: Optional[str] - ) -> str: - if self.config.convert_lineage_urns_to_lowercase: - name = name.lower() - - return builder.make_dataset_urn_with_platform_instance( - platform, name, platform_instance, self.config.env - ) - - -# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy. 
-def has_three_level_hierarchy(platform: str) -> bool: - return platform in ["postgres", "trino", "redshift", "snowflake"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py new file mode 100644 index 0000000000000..36f6a96c0d408 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py @@ -0,0 +1,202 @@ +import logging +from dataclasses import dataclass, field +from typing import Dict, Iterable, List, Optional + +from pydantic.fields import Field + +from datahub.configuration.common import AllowDenyPattern, ConfigModel +from datahub.configuration.source_common import ( + DatasetLineageProviderConfigBase, + PlatformInstanceConfigMixin, +) +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalSourceReport, + StatefulStaleMetadataRemovalConfig, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionConfigBase, +) + +logger = logging.getLogger(__name__) + +KAFKA = "kafka" +SOURCE = "source" +SINK = "sink" +CONNECTOR_CLASS = "connector.class" + + +class ProvidedConfig(ConfigModel): + provider: str + path_key: str + value: str + + +class GenericConnectorConfig(ConfigModel): + connector_name: str + source_dataset: str + source_platform: str + + +class KafkaConnectSourceConfig( + PlatformInstanceConfigMixin, + DatasetLineageProviderConfigBase, + StatefulIngestionConfigBase, +): + # See the Connect REST Interface for details + # https://docs.confluent.io/platform/current/connect/references/restapi.html# + connect_uri: str = Field( + default="http://localhost:8083/", description="URI to connect to." + ) + username: Optional[str] = Field(default=None, description="Kafka Connect username.") + password: Optional[str] = Field(default=None, description="Kafka Connect password.") + cluster_name: Optional[str] = Field( + default="connect-cluster", description="Cluster to ingest from." + ) + # convert lineage dataset's urns to lowercase + convert_lineage_urns_to_lowercase: bool = Field( + default=False, + description="Whether to convert the urns of ingested lineage dataset to lowercase", + ) + connector_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for connectors to filter for ingestion.", + ) + provided_configs: Optional[List[ProvidedConfig]] = Field( + default=None, description="Provided Configurations" + ) + connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field( + default=None, + description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`', + ) + platform_instance_map: Optional[Dict[str, str]] = Field( + default=None, + description='Platform instance mapping to use when constructing URNs. 
e.g.`platform_instance_map: { "hive": "warehouse" }`', + ) + generic_connectors: List[GenericConnectorConfig] = Field( + default=[], + description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector", + ) + + stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None + + +@dataclass +class KafkaConnectSourceReport(StaleEntityRemovalSourceReport): + connectors_scanned: int = 0 + filtered: List[str] = field(default_factory=list) + + def report_connector_scanned(self, connector: str) -> None: + self.connectors_scanned += 1 + + def report_dropped(self, connector: str) -> None: + self.filtered.append(connector) + + +@dataclass +class KafkaConnectLineage: + """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob""" + + source_platform: str + target_dataset: str + target_platform: str + job_property_bag: Optional[Dict[str, str]] = None + source_dataset: Optional[str] = None + + +@dataclass +class ConnectorManifest: + """Each instance is potential DataFlow""" + + name: str + type: str + config: Dict + tasks: Dict + url: Optional[str] = None + flow_property_bag: Optional[Dict[str, str]] = None + lineages: List[KafkaConnectLineage] = field(default_factory=list) + topic_names: Iterable[str] = field(default_factory=list) + + +def remove_prefix(text: str, prefix: str) -> str: + if text.startswith(prefix): + index = len(prefix) + return text[index:] + return text + + +def unquote( + string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None +) -> str: + """ + If string starts and ends with a quote, unquote it + """ + trailing_quote = trailing_quote if trailing_quote else leading_quote + if string.startswith(leading_quote) and string.endswith(trailing_quote): + string = string[1:-1] + return string + + +def get_dataset_name( + database_name: Optional[str], + source_table: str, +) -> str: + if database_name: + dataset_name = database_name + "." + source_table + else: + dataset_name = source_table + + return dataset_name + + +def get_platform_instance( + config: KafkaConnectSourceConfig, connector_name: str, platform: str +) -> Optional[str]: + instance_name = None + if ( + config.connect_to_platform_map + and config.connect_to_platform_map.get(connector_name) + and config.connect_to_platform_map[connector_name].get(platform) + ): + instance_name = config.connect_to_platform_map[connector_name][platform] + if config.platform_instance_map and config.platform_instance_map.get(platform): + logger.warning( + f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map." + "Will prefer connector specific platform instance from connect_to_platform_map." 
+ ) + elif config.platform_instance_map and config.platform_instance_map.get(platform): + instance_name = config.platform_instance_map[platform] + logger.info( + f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}" + ) + return instance_name + + +def transform_connector_config( + connector_config: Dict, provided_configs: List[ProvidedConfig] +) -> None: + """This method will update provided configs in connector config values, if any""" + lookupsByProvider = {} + for pconfig in provided_configs: + lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value + for k, v in connector_config.items(): + for key, value in lookupsByProvider.items(): + if key in v: + connector_config[k] = connector_config[k].replace(key, value) + + +# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy. +def has_three_level_hierarchy(platform: str) -> bool: + return platform in ["postgres", "trino", "redshift", "snowflake"] + + +@dataclass +class BaseConnector: + connector_manifest: ConnectorManifest + config: KafkaConnectSourceConfig + report: KafkaConnectSourceReport + + def extract_lineages(self) -> List[KafkaConnectLineage]: + return [] + + def extract_flow_property_bag(self) -> Optional[Dict[str, str]]: + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py new file mode 100644 index 0000000000000..fa6b614c4b52a --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py @@ -0,0 +1,367 @@ +import logging +from typing import Iterable, List, Optional, Type + +import jpype +import jpype.imports +import requests + +import datahub.emitter.mce_builder as builder +import datahub.metadata.schema_classes as models +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.kafka_connect.common import ( + CONNECTOR_CLASS, + SINK, + SOURCE, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, + KafkaConnectSourceConfig, + KafkaConnectSourceReport, + get_platform_instance, + transform_connector_config, +) +from datahub.ingestion.source.kafka_connect.sink_connectors import ( + BIGQUERY_SINK_CONNECTOR_CLASS, + S3_SINK_CONNECTOR_CLASS, + SNOWFLAKE_SINK_CONNECTOR_CLASS, + BigQuerySinkConnector, + ConfluentS3SinkConnector, + SnowflakeSinkConnector, +) +from datahub.ingestion.source.kafka_connect.source_connectors import ( + DEBEZIUM_SOURCE_CONNECTOR_PREFIX, + JDBC_SOURCE_CONNECTOR_CLASS, + MONGO_SOURCE_CONNECTOR_CLASS, + ConfigDrivenSourceConnector, + ConfluentJDBCSourceConnector, + DebeziumSourceConnector, + MongoSourceConnector, +) +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionSourceBase, +) + +logger = logging.getLogger(__name__) + + +@platform_name("Kafka Connect") +@config_class(KafkaConnectSourceConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") 
+@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") +class KafkaConnectSource(StatefulIngestionSourceBase): + config: KafkaConnectSourceConfig + report: KafkaConnectSourceReport + platform: str = "kafka-connect" + + def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext): + super().__init__(config, ctx) + self.config = config + self.report = KafkaConnectSourceReport() + self.session = requests.Session() + self.session.headers.update( + { + "Accept": "application/json", + "Content-Type": "application/json", + } + ) + + # Test the connection + if self.config.username is not None and self.config.password is not None: + logger.info( + f"Connecting to {self.config.connect_uri} with Authentication..." + ) + self.session.auth = (self.config.username, self.config.password) + + test_response = self.session.get(f"{self.config.connect_uri}/connectors") + test_response.raise_for_status() + logger.info(f"Connection to {self.config.connect_uri} is ok") + if not jpype.isJVMStarted(): + jpype.startJVM() + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: + config = KafkaConnectSourceConfig.parse_obj(config_dict) + return cls(config, ctx) + + def get_connectors_manifest(self) -> Iterable[ConnectorManifest]: + """Get Kafka Connect connectors manifest using REST API. + Enrich with lineages metadata. + """ + + connector_response = self.session.get( + f"{self.config.connect_uri}/connectors", + ) + + payload = connector_response.json() + + for connector_name in payload: + connector_url = f"{self.config.connect_uri}/connectors/{connector_name}" + connector_manifest = self._get_connector_manifest( + connector_name, connector_url + ) + if ( + connector_manifest is None + or not self.config.connector_patterns.allowed(connector_manifest.name) + ): + self.report.report_dropped(connector_name) + continue + + if self.config.provided_configs: + transform_connector_config( + connector_manifest.config, self.config.provided_configs + ) + connector_manifest.url = connector_url + connector_manifest.topic_names = self._get_connector_topics(connector_name) + connector_class_value = connector_manifest.config.get(CONNECTOR_CLASS) or "" + + class_type: Type[BaseConnector] = BaseConnector + + # Populate Source Connector metadata + if connector_manifest.type == SOURCE: + connector_manifest.tasks = self._get_connector_tasks(connector_name) + + # JDBC source connector lineages + if connector_class_value == JDBC_SOURCE_CONNECTOR_CLASS: + class_type = ConfluentJDBCSourceConnector + elif connector_class_value.startswith(DEBEZIUM_SOURCE_CONNECTOR_PREFIX): + class_type = DebeziumSourceConnector + elif connector_class_value == MONGO_SOURCE_CONNECTOR_CLASS: + class_type = MongoSourceConnector + elif any( + [ + connector.connector_name == connector_manifest.name + for connector in self.config.generic_connectors + ] + ): + class_type = ConfigDrivenSourceConnector + else: + self.report.report_dropped(connector_manifest.name) + self.report.warning( + "Lineage for Source Connector not supported. 
" + "Please refer to Kafka Connect docs to use `generic_connectors` config.", + context=f"{connector_manifest.name} of type {connector_class_value}", + ) + continue + elif connector_manifest.type == SINK: + if connector_class_value == BIGQUERY_SINK_CONNECTOR_CLASS: + class_type = BigQuerySinkConnector + elif connector_class_value == S3_SINK_CONNECTOR_CLASS: + class_type = ConfluentS3SinkConnector + elif connector_class_value == SNOWFLAKE_SINK_CONNECTOR_CLASS: + class_type = SnowflakeSinkConnector + else: + self.report.report_dropped(connector_manifest.name) + self.report.warning( + "Lineage for Sink Connector not supported.", + context=f"{connector_manifest.name} of type {connector_class_value}", + ) + + connector_class = class_type(connector_manifest, self.config, self.report) + connector_manifest.lineages = connector_class.extract_lineages() + connector_manifest.flow_property_bag = ( + connector_class.extract_flow_property_bag() + ) + + yield connector_manifest + + def _get_connector_manifest( + self, connector_name: str, connector_url: str + ) -> Optional[ConnectorManifest]: + try: + connector_response = self.session.get(connector_url) + connector_response.raise_for_status() + except Exception as e: + self.report.warning( + "Failed to get connector details", connector_name, exc=e + ) + return None + manifest = connector_response.json() + connector_manifest = ConnectorManifest(**manifest) + return connector_manifest + + def _get_connector_tasks(self, connector_name: str) -> dict: + try: + response = self.session.get( + f"{self.config.connect_uri}/connectors/{connector_name}/tasks", + ) + response.raise_for_status() + except Exception as e: + self.report.warning( + "Error getting connector tasks", context=connector_name, exc=e + ) + return {} + + return response.json() + + def _get_connector_topics(self, connector_name: str) -> List[str]: + try: + response = self.session.get( + f"{self.config.connect_uri}/connectors/{connector_name}/topics", + ) + response.raise_for_status() + except Exception as e: + self.report.warning( + "Error getting connector topics", context=connector_name, exc=e + ) + return [] + + return response.json()[connector_name]["topics"] + + def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit: + connector_name = connector.name + connector_type = connector.type + connector_class = connector.config.get(CONNECTOR_CLASS) + flow_property_bag = connector.flow_property_bag + # connector_url = connector.url # NOTE: this will expose connector credential when used + flow_urn = builder.make_data_flow_urn( + self.platform, + connector_name, + self.config.env, + self.config.platform_instance, + ) + + return MetadataChangeProposalWrapper( + entityUrn=flow_urn, + aspect=models.DataFlowInfoClass( + name=connector_name, + description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.", + customProperties=flow_property_bag, + # externalUrl=connector_url, # NOTE: this will expose connector credential when used + ), + ).as_workunit() + + def construct_job_workunits( + self, connector: ConnectorManifest + ) -> Iterable[MetadataWorkUnit]: + connector_name = connector.name + flow_urn = builder.make_data_flow_urn( + self.platform, + connector_name, + self.config.env, + self.config.platform_instance, + ) + + lineages = connector.lineages + if lineages: + for lineage in lineages: + source_dataset = lineage.source_dataset + source_platform = lineage.source_platform + target_dataset = lineage.target_dataset + target_platform = 
lineage.target_platform + job_property_bag = lineage.job_property_bag + + source_platform_instance = get_platform_instance( + self.config, connector_name, source_platform + ) + target_platform_instance = get_platform_instance( + self.config, connector_name, target_platform + ) + + job_id = self.get_job_id(lineage, connector, self.config) + job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id) + + inlets = ( + [ + self.make_lineage_dataset_urn( + source_platform, source_dataset, source_platform_instance + ) + ] + if source_dataset + else [] + ) + outlets = [ + self.make_lineage_dataset_urn( + target_platform, target_dataset, target_platform_instance + ) + ] + + yield MetadataChangeProposalWrapper( + entityUrn=job_urn, + aspect=models.DataJobInfoClass( + name=f"{connector_name}:{job_id}", + type="COMMAND", + customProperties=job_property_bag, + ), + ).as_workunit() + + yield MetadataChangeProposalWrapper( + entityUrn=job_urn, + aspect=models.DataJobInputOutputClass( + inputDatasets=inlets, + outputDatasets=outlets, + ), + ).as_workunit() + + def get_job_id( + self, + lineage: KafkaConnectLineage, + connector: ConnectorManifest, + config: KafkaConnectSourceConfig, + ) -> str: + connector_class = connector.config.get(CONNECTOR_CLASS) + + # Note - This block is only to maintain backward compatibility of Job URN + if ( + connector_class + and connector.type == SOURCE + and ( + "JdbcSourceConnector" in connector_class + or connector_class.startswith("io.debezium.connector") + ) + and lineage.source_dataset + and config.connect_to_platform_map + and config.connect_to_platform_map.get(connector.name) + and config.connect_to_platform_map[connector.name].get( + lineage.source_platform + ) + ): + return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}" + + return ( + lineage.source_dataset + if lineage.source_dataset + else f"unknown_source.{lineage.target_dataset}" + ) + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + for connector in self.get_connectors_manifest(): + yield self.construct_flow_workunit(connector) + yield from self.construct_job_workunits(connector) + self.report.report_connector_scanned(connector.name) + + def get_report(self) -> KafkaConnectSourceReport: + return self.report + + def make_lineage_dataset_urn( + self, platform: str, name: str, platform_instance: Optional[str] + ) -> str: + if self.config.convert_lineage_urns_to_lowercase: + name = name.lower() + + return builder.make_dataset_urn_with_platform_instance( + platform, name, platform_instance, self.config.env + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py new file mode 100644 index 0000000000000..2790460c8e601 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py @@ -0,0 +1,341 @@ +import re +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +from datahub.ingestion.source.kafka_connect.common import ( + KAFKA, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, +) + + +@dataclass +class ConfluentS3SinkConnector(BaseConnector): + @dataclass + class S3SinkParser: + target_platform: str + 
bucket: str + topics_dir: str + topics: Iterable[str] + + def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser: + # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3 + bucket = connector_manifest.config.get("s3.bucket.name") + if not bucket: + raise ValueError( + "Could not find 's3.bucket.name' in connector configuration" + ) + + # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage + topics_dir = connector_manifest.config.get("topics.dir", "topics") + + return self.S3SinkParser( + target_platform="s3", + bucket=bucket, + topics_dir=topics_dir, + topics=connector_manifest.topic_names, + ) + + def extract_flow_property_bag(self) -> Dict[str, str]: + # Mask/Remove properties that may reveal credentials + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k + not in [ + "aws.access.key.id", + "aws.secret.access.key", + "s3.sse.customer.key", + "s3.proxy.password", + ] + } + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + try: + parser = self._get_parser(self.connector_manifest) + + lineages: List[KafkaConnectLineage] = list() + for topic in parser.topics: + target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}" + + lineages.append( + KafkaConnectLineage( + source_dataset=topic, + source_platform="kafka", + target_dataset=target_dataset, + target_platform=parser.target_platform, + ) + ) + return lineages + except Exception as e: + self.report.warning( + "Error resolving lineage for connector", + self.connector_manifest.name, + exc=e, + ) + + return [] + + +@dataclass +class SnowflakeSinkConnector(BaseConnector): + @dataclass + class SnowflakeParser: + database_name: str + schema_name: str + topics_to_tables: Dict[str, str] + + def get_table_name_from_topic_name(self, topic_name: str) -> str: + """ + This function converts the topic name to a valid Snowflake table name using some rules. + Refer below link for more info + https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics + """ + table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name) + if re.match("^[^a-zA-Z_].*", table_name): + table_name = "_" + table_name + # Connector may append original topic's hash code as suffix for conflict resolution + # if generated table names for 2 topics are similar. This corner case is not handled here. + # Note that Snowflake recommends to choose topic names that follow the rules for + # Snowflake identifier names so this case is not recommended by snowflake. 
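+        # For example (illustrative): topic "2-user.events" is converted to the
+        # table name "_2_user_events" by the rules above.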
+ return table_name + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> SnowflakeParser: + database_name = connector_manifest.config["snowflake.database.name"] + schema_name = connector_manifest.config["snowflake.schema.name"] + + # Fetch user provided topic to table map + provided_topics_to_tables: Dict[str, str] = {} + if connector_manifest.config.get("snowflake.topic2table.map"): + for each in connector_manifest.config["snowflake.topic2table.map"].split( + "," + ): + topic, table = each.split(":") + provided_topics_to_tables[topic.strip()] = table.strip() + + topics_to_tables: Dict[str, str] = {} + # Extract lineage for only those topics whose data ingestion started + for topic in connector_manifest.topic_names: + if topic in provided_topics_to_tables: + # If user provided which table to get mapped with this topic + topics_to_tables[topic] = provided_topics_to_tables[topic] + else: + # Else connector converts topic name to a valid Snowflake table name. + topics_to_tables[topic] = self.get_table_name_from_topic_name(topic) + + return self.SnowflakeParser( + database_name=database_name, + schema_name=schema_name, + topics_to_tables=topics_to_tables, + ) + + def extract_flow_property_bag(self) -> Dict[str, str]: + # For all snowflake sink connector properties, refer below link + # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector + # remove private keys, secrets from properties + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k + not in [ + "snowflake.private.key", + "snowflake.private.key.passphrase", + "value.converter.basic.auth.user.info", + ] + } + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + + for topic, table in parser.topics_to_tables.items(): + target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}" + lineages.append( + KafkaConnectLineage( + source_dataset=topic, + source_platform=KAFKA, + target_dataset=target_dataset, + target_platform="snowflake", + ) + ) + + return lineages + + +@dataclass +class BigQuerySinkConnector(BaseConnector): + @dataclass + class BQParser: + project: str + target_platform: str + sanitizeTopics: str + transforms: list + topicsToTables: Optional[str] = None + datasets: Optional[str] = None + defaultDataset: Optional[str] = None + version: str = "v1" + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> BQParser: + project = connector_manifest.config["project"] + sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false") + transform_names = ( + self.connector_manifest.config.get("transforms", "").split(",") + if self.connector_manifest.config.get("transforms") + else [] + ) + transforms = [] + for name in transform_names: + transform = {"name": name} + transforms.append(transform) + for key in self.connector_manifest.config.keys(): + if key.startswith(f"transforms.{name}."): + transform[ + key.replace(f"transforms.{name}.", "") + ] = self.connector_manifest.config[key] + + if "defaultDataset" in connector_manifest.config: + defaultDataset = connector_manifest.config["defaultDataset"] + return self.BQParser( + project=project, + defaultDataset=defaultDataset, + target_platform="bigquery", + sanitizeTopics=sanitizeTopics.lower() == "true", + version="v2", + transforms=transforms, + ) + else: + # version 1.6.x and similar configs supported + datasets = 
connector_manifest.config["datasets"] + topicsToTables = connector_manifest.config.get("topicsToTables") + + return self.BQParser( + project=project, + topicsToTables=topicsToTables, + datasets=datasets, + target_platform="bigquery", + sanitizeTopics=sanitizeTopics.lower() == "true", + transforms=transforms, + ) + + def get_list(self, property: str) -> Iterable[Tuple[str, str]]: + entries = property.split(",") + for entry in entries: + key, val = entry.rsplit("=") + yield (key.strip(), val.strip()) + + def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]: + topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets)) # type: ignore + from java.util.regex import Pattern + + for pattern, dataset in topicregex_dataset_map.items(): + patternMatcher = Pattern.compile(pattern).matcher(topic) + if patternMatcher.matches(): + return dataset + return None + + def sanitize_table_name(self, table_name): + table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name) + if re.match("^[^a-zA-Z_].*", table_name): + table_name = "_" + table_name + + return table_name + + def get_dataset_table_for_topic( + self, topic: str, parser: BQParser + ) -> Optional[str]: + if parser.version == "v2": + dataset = parser.defaultDataset + parts = topic.split(":") + if len(parts) == 2: + dataset = parts[0] + table = parts[1] + else: + table = parts[0] + else: + dataset = self.get_dataset_for_topic_v1(topic, parser) + if dataset is None: + return None + + table = topic + if parser.topicsToTables: + topicregex_table_map: Dict[str, str] = dict( + self.get_list(parser.topicsToTables) # type: ignore + ) + from java.util.regex import Pattern + + for pattern, tbl in topicregex_table_map.items(): + patternMatcher = Pattern.compile(pattern).matcher(topic) + if patternMatcher.matches(): + table = tbl + break + + if parser.sanitizeTopics: + table = self.sanitize_table_name(table) + return f"{dataset}.{table}" + + def apply_transformations( + self, topic: str, transforms: List[Dict[str, str]] + ) -> str: + for transform in transforms: + if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter": + regex = transform["regex"] + replacement = transform["replacement"] + pattern = re.compile(regex) + if pattern.match(topic): + topic = pattern.sub(replacement, topic, count=1) + return topic + + def extract_flow_property_bag(self) -> Dict[str, str]: + # Mask/Remove properties that may reveal credentials + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k not in ["keyfile"] + } + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + if not parser: + return lineages + target_platform = parser.target_platform + project = parser.project + transforms = parser.transforms + + for topic in self.connector_manifest.topic_names: + transformed_topic = self.apply_transformations(topic, transforms) + dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser) + if dataset_table is None: + self.report.warning( + "Could not find target dataset for topic, please check your connector configuration" + f"{self.connector_manifest.name} : {transformed_topic} ", + ) + continue + target_dataset = f"{project}.{dataset_table}" + + lineages.append( + KafkaConnectLineage( + source_dataset=transformed_topic, + source_platform=KAFKA, + target_dataset=target_dataset, + target_platform=target_platform, + ) + ) + return lineages + + 
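+# Illustrative note (assumed config, not from the connector docs): with the v2 parser
+# above and a defaultDataset of "analytics", topic "orders" maps to "analytics.orders",
+# while a prefixed topic "staging:users" maps to "staging.users"; when sanitizeTopics
+# is enabled, characters outside [a-zA-Z0-9_] in the table name are replaced with "_".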
+BIGQUERY_SINK_CONNECTOR_CLASS = "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector" +S3_SINK_CONNECTOR_CLASS = "io.confluent.connect.s3.S3SinkConnector" +SNOWFLAKE_SINK_CONNECTOR_CLASS = "com.snowflake.kafka.connector.SnowflakeSinkConnector" diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py new file mode 100644 index 0000000000000..7b3b6e551a0a1 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py @@ -0,0 +1,570 @@ +import logging +import re +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +from sqlalchemy.engine.url import make_url + +from datahub.ingestion.source.kafka_connect.common import ( + CONNECTOR_CLASS, + KAFKA, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, + get_dataset_name, + has_three_level_hierarchy, + remove_prefix, + unquote, +) +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( + get_platform_from_sqlalchemy_uri, +) + + +@dataclass +class ConfluentJDBCSourceConnector(BaseConnector): + REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter" + KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER] + # https://kafka.apache.org/documentation/#connect_included_transformation + KAFKA_NONTOPICROUTING_TRANSFORMS = [ + "InsertField", + "InsertField$Key", + "InsertField$Value", + "ReplaceField", + "ReplaceField$Key", + "ReplaceField$Value", + "MaskField", + "MaskField$Key", + "MaskField$Value", + "ValueToKey", + "ValueToKey$Key", + "ValueToKey$Value", + "HoistField", + "HoistField$Key", + "HoistField$Value", + "ExtractField", + "ExtractField$Key", + "ExtractField$Value", + "SetSchemaMetadata", + "SetSchemaMetadata$Key", + "SetSchemaMetadata$Value", + "Flatten", + "Flatten$Key", + "Flatten$Value", + "Cast", + "Cast$Key", + "Cast$Value", + "HeadersFrom", + "HeadersFrom$Key", + "HeadersFrom$Value", + "TimestampConverter", + "Filter", + "InsertHeader", + "DropHeaders", + ] + # https://docs.confluent.io/platform/current/connect/transforms/overview.html + CONFLUENT_NONTOPICROUTING_TRANSFORMS = [ + "Drop", + "Drop$Key", + "Drop$Value", + "Filter", + "Filter$Key", + "Filter$Value", + "TombstoneHandler", + ] + KNOWN_NONTOPICROUTING_TRANSFORMS = ( + KAFKA_NONTOPICROUTING_TRANSFORMS + + [ + f"org.apache.kafka.connect.transforms.{t}" + for t in KAFKA_NONTOPICROUTING_TRANSFORMS + ] + + CONFLUENT_NONTOPICROUTING_TRANSFORMS + + [ + f"io.confluent.connect.transforms.{t}" + for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS + ] + ) + + @dataclass + class JdbcParser: + db_connection_url: str + source_platform: str + database_name: str + topic_prefix: str + query: str + transforms: list + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> JdbcParser: + url = remove_prefix( + str(connector_manifest.config.get("connection.url")), "jdbc:" + ) + url_instance = make_url(url) + source_platform = get_platform_from_sqlalchemy_uri(str(url_instance)) + database_name = url_instance.database + assert database_name + db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}" + + topic_prefix = self.connector_manifest.config.get("topic.prefix", None) + + query = self.connector_manifest.config.get("query", None) + + transform_names = ( + self.connector_manifest.config.get("transforms", "").split(",") + if self.connector_manifest.config.get("transforms") + else [] + ) + + transforms = [] + for 
name in transform_names: + transform = {"name": name} + transforms.append(transform) + for key in self.connector_manifest.config.keys(): + if key.startswith(f"transforms.{name}."): + transform[ + key.replace(f"transforms.{name}.", "") + ] = self.connector_manifest.config[key] + + return self.JdbcParser( + db_connection_url, + source_platform, + database_name, + topic_prefix, + query, + transforms, + ) + + def default_get_lineages( + self, + topic_prefix: str, + database_name: str, + source_platform: str, + topic_names: Optional[Iterable[str]] = None, + include_source_dataset: bool = True, + ) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = [] + if not topic_names: + topic_names = self.connector_manifest.topic_names + table_name_tuples: List[Tuple] = self.get_table_names() + for topic in topic_names: + # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM)) + source_table: str = ( + remove_prefix(topic, topic_prefix) if topic_prefix else topic + ) + # include schema name for three-level hierarchies + if has_three_level_hierarchy(source_platform): + table_name_tuple: Tuple = next( + iter([t for t in table_name_tuples if t and t[-1] == source_table]), + (), + ) + if len(table_name_tuple) > 1: + source_table = f"{table_name_tuple[-2]}.{source_table}" + else: + include_source_dataset = False + self.report.warning( + "Could not find schema for table" + f"{self.connector_manifest.name} : {source_table}", + ) + dataset_name: str = get_dataset_name(database_name, source_table) + lineage = KafkaConnectLineage( + source_dataset=dataset_name if include_source_dataset else None, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + def get_table_names(self) -> List[Tuple]: + sep: str = "." 
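+        # Illustrative example: a task-reported table id of '"public"."users"' is
+        # split on "." and unquoted below into the (schema, table) tuple ("public", "users").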
+ leading_quote_char: str = '"' + trailing_quote_char: str = leading_quote_char + + table_ids: List[str] = [] + if self.connector_manifest.tasks: + table_ids = ( + ",".join( + [ + task["config"].get("tables") + for task in self.connector_manifest.tasks + ] + ) + ).split(",") + quote_method = self.connector_manifest.config.get( + "quote.sql.identifiers", "always" + ) + if ( + quote_method == "always" + and table_ids + and table_ids[0] + and table_ids[-1] + ): + leading_quote_char = table_ids[0][0] + trailing_quote_char = table_ids[-1][-1] + # This will only work for single character quotes + elif self.connector_manifest.config.get("table.whitelist"): + table_ids = self.connector_manifest.config.get("table.whitelist").split(",") # type: ignore + + # List of Tuple containing (schema, table) + tables: List[Tuple] = [ + ( + ( + unquote( + table_id.split(sep)[-2], leading_quote_char, trailing_quote_char + ) + if len(table_id.split(sep)) > 1 + else "" + ), + unquote( + table_id.split(sep)[-1], leading_quote_char, trailing_quote_char + ), + ) + for table_id in table_ids + ] + return tables + + def extract_flow_property_bag(self) -> Dict[str, str]: + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k not in ["connection.password", "connection.user"] + } + + # Mask/Remove properties that may reveal credentials + flow_property_bag["connection.url"] = self.get_parser( + self.connector_manifest + ).db_connection_url + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + database_name = parser.database_name + query = parser.query + topic_prefix = parser.topic_prefix + transforms = parser.transforms + + logging.debug( + f"Extracting source platform: {source_platform} and database name: {database_name} from connection url " + ) + + if not self.connector_manifest.topic_names: + return lineages + + if query: + # Lineage source_table can be extracted by parsing query + for topic in self.connector_manifest.topic_names: + # default method - as per earlier implementation + dataset_name: str = get_dataset_name(database_name, topic) + + lineage = KafkaConnectLineage( + source_dataset=None, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + self.report.warning( + "Could not find input dataset, the connector has query configuration set", + self.connector_manifest.name, + ) + return lineages + + SINGLE_TRANSFORM = len(transforms) == 1 + NO_TRANSFORM = len(transforms) == 0 + UNKNOWN_TRANSFORM = any( + [ + transform["type"] + not in self.KNOWN_TOPICROUTING_TRANSFORMS + + self.KNOWN_NONTOPICROUTING_TRANSFORMS + for transform in transforms + ] + ) + ALL_TRANSFORMS_NON_TOPICROUTING = all( + [ + transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS + for transform in transforms + ] + ) + + if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING: + return self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + ) + + if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER: + tables = self.get_table_names() + topic_names = list(self.connector_manifest.topic_names) + + from java.util.regex import Pattern + + for table in tables: + source_table: str = table[-1] + topic = topic_prefix + source_table if topic_prefix else source_table + + transform_regex = 
Pattern.compile(transforms[0]["regex"]) + transform_replacement = transforms[0]["replacement"] + + matcher = transform_regex.matcher(topic) + if matcher.matches(): + topic = str(matcher.replaceFirst(transform_replacement)) + + # Additional check to confirm that the topic present + # in connector topics + + if topic in self.connector_manifest.topic_names: + # include schema name for three-level hierarchies + if has_three_level_hierarchy(source_platform) and len(table) > 1: + source_table = f"{table[-2]}.{table[-1]}" + + dataset_name = get_dataset_name(database_name, source_table) + + lineage = KafkaConnectLineage( + source_dataset=dataset_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + topic_names.remove(topic) + lineages.append(lineage) + + if topic_names: + lineages.extend( + self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + topic_names=topic_names, + include_source_dataset=False, + ) + ) + self.report.warning( + "Could not find input dataset for connector topics", + f"{self.connector_manifest.name} : {topic_names}", + ) + return lineages + else: + include_source_dataset = True + if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: + self.report.warning( + "Could not find input dataset, connector has unknown transform", + f"{self.connector_manifest.name} : {transforms[0]['type']}", + ) + include_source_dataset = False + if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: + self.report.warning( + "Could not find input dataset, connector has one or more unknown transforms", + self.connector_manifest.name, + ) + include_source_dataset = False + lineages = self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + include_source_dataset=include_source_dataset, + ) + return lineages + + +@dataclass +class MongoSourceConnector(BaseConnector): + # https://www.mongodb.com/docs/kafka-connector/current/source-connector/ + + @dataclass + class MongoSourceParser: + db_connection_url: Optional[str] + source_platform: str + database_name: Optional[str] + topic_prefix: Optional[str] + transforms: List[str] + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> MongoSourceParser: + parser = self.MongoSourceParser( + db_connection_url=connector_manifest.config.get("connection.uri"), + source_platform="mongodb", + database_name=connector_manifest.config.get("database"), + topic_prefix=connector_manifest.config.get("topic_prefix"), + transforms=( + connector_manifest.config["transforms"].split(",") + if "transforms" in connector_manifest.config + else [] + ), + ) + + return parser + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)" + + if not self.connector_manifest.topic_names: + return lineages + + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) + + if found: + table_name = get_dataset_name(found.group(1), found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + +@dataclass +class DebeziumSourceConnector(BaseConnector): + @dataclass + class DebeziumParser: + source_platform: str + server_name: 
Optional[str] + database_name: Optional[str] + + def get_server_name(self, connector_manifest: ConnectorManifest) -> str: + if "topic.prefix" in connector_manifest.config: + return connector_manifest.config["topic.prefix"] + else: + return connector_manifest.config.get("database.server.name", "") + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> DebeziumParser: + connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "") + + if connector_class == "io.debezium.connector.mysql.MySqlConnector": + parser = self.DebeziumParser( + source_platform="mysql", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "MySqlConnector": + parser = self.DebeziumParser( + source_platform="mysql", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector": + parser = self.DebeziumParser( + source_platform="mongodb", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "io.debezium.connector.postgresql.PostgresConnector": + parser = self.DebeziumParser( + source_platform="postgres", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.oracle.OracleConnector": + parser = self.DebeziumParser( + source_platform="oracle", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": + database_name = connector_manifest.config.get( + "database.names" + ) or connector_manifest.config.get("database.dbname") + + if "," in str(database_name): + raise Exception( + f"Only one database is supported for Debezium's SQL Server connector. 
Found: {database_name}" + ) + + parser = self.DebeziumParser( + source_platform="mssql", + server_name=self.get_server_name(connector_manifest), + database_name=database_name, + ) + elif connector_class == "io.debezium.connector.db2.Db2Connector": + parser = self.DebeziumParser( + source_platform="db2", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.vitess.VitessConnector": + parser = self.DebeziumParser( + source_platform="vitess", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("vitess.keyspace"), + ) + else: + raise ValueError(f"Connector class '{connector_class}' is unknown.") + + return parser + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + + try: + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + server_name = parser.server_name + database_name = parser.database_name + topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)" + + if not self.connector_manifest.topic_names: + return lineages + + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) + + if found: + table_name = get_dataset_name(database_name, found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + except Exception as e: + self.report.warning( + "Error resolving lineage for connector", + self.connector_manifest.name, + exc=e, + ) + + return [] + + +@dataclass +class ConfigDrivenSourceConnector(BaseConnector): + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages = [] + for connector in self.config.generic_connectors: + if connector.connector_name == self.connector_manifest.name: + target_connector = connector + break + for topic in self.connector_manifest.topic_names: + lineage = KafkaConnectLineage( + source_dataset=target_connector.source_dataset, + source_platform=target_connector.source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + +JDBC_SOURCE_CONNECTOR_CLASS = "io.confluent.connect.jdbc.JdbcSourceConnector" +DEBEZIUM_SOURCE_CONNECTOR_PREFIX = "io.debezium.connector" +MONGO_SOURCE_CONNECTOR_CLASS = "com.mongodb.kafka.connect.MongoSourceConnector" From 2e544614f12bf2ad8e758b2fd742ee14c6998825 Mon Sep 17 00:00:00 2001 From: sagar-salvi-apptware <159135491+sagar-salvi-apptware@users.noreply.github.com> Date: Thu, 19 Dec 2024 12:41:40 +0530 Subject: [PATCH 8/8] feat(ingest): add looker meta extractor support in sql parsing (#12062) Co-authored-by: Mayuri N Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../datahub/configuration/source_common.py | 13 ++ .../ingestion/source/looker/looker_common.py | 56 +++++- .../source/looker/looker_lib_wrapper.py | 14 +- .../ingestion/source/looker/looker_source.py | 13 +- .../ingestion/source/powerbi/config.py | 15 +- .../powerbi/dataplatform_instance_resolver.py | 2 +- .../source/powerbi/m_query/pattern_handler.py | 2 +- .../source/snowflake/snowflake_v2.py | 1 + .../sql_parsing/sql_parsing_aggregator.py | 2 +- .../sql_parsing/tool_meta_extractor.py | 121 ++++++++++++- .../looker/golden_looker_mces.json | 56 ++++++ .../looker/golden_test_allow_ingest.json | 53 ++++++ 
...olden_test_external_project_view_mces.json | 53 ++++++ .../looker/golden_test_file_path_ingest.json | 53 ++++++ ...olden_test_folder_path_pattern_ingest.json | 53 ++++++ .../golden_test_independent_look_ingest.json | 170 +++++++++++++----- .../looker/golden_test_ingest.json | 54 ++++++ .../looker/golden_test_ingest_joins.json | 53 ++++++ .../golden_test_ingest_unaliased_joins.json | 53 ++++++ ...en_test_non_personal_independent_look.json | 71 ++++++++ .../looker_mces_golden_deleted_stateful.json | 68 ++++++- .../looker/looker_mces_usage_history.json | 53 ++++++ .../tests/integration/looker/test_looker.py | 20 +++ .../sql_parsing/test_tool_meta_extractor.py | 44 ++++- .../state/test_redundant_run_skip_handler.py | 6 +- .../platformresource/PlatformResourceType.pdl | 6 +- 26 files changed, 1026 insertions(+), 79 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/source_common.py b/metadata-ingestion/src/datahub/configuration/source_common.py index 44c737f1bd13d..8e41e9fb91787 100644 --- a/metadata-ingestion/src/datahub/configuration/source_common.py +++ b/metadata-ingestion/src/datahub/configuration/source_common.py @@ -63,3 +63,16 @@ class DatasetLineageProviderConfigBase(EnvConfigMixin): default=None, description="A holder for platform -> platform_instance mappings to generate correct dataset urns", ) + + +class PlatformDetail(ConfigModel): + platform_instance: Optional[str] = Field( + default=None, + description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match " + "with platform instance name used in ingestion " + "recipe of other datahub sources.", + ) + env: str = Field( + default=DEFAULT_ENV, + description="The environment that all assets produced by DataHub platform ingestion source belong to", + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 57a251ef2ed14..a66962f962255 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -31,6 +31,10 @@ from pydantic.class_validators import validator import datahub.emitter.mce_builder as builder +from datahub.api.entities.platformresource.platform_resource import ( + PlatformResource, + PlatformResourceKey, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ContainerKey, create_embed_mcp from datahub.ingestion.api.report import Report @@ -106,7 +110,7 @@ from datahub.utilities.url_util import remove_port_from_url CORPUSER_DATAHUB = "urn:li:corpuser:datahub" - +LOOKER = "looker" logger = logging.getLogger(__name__) @@ -1411,6 +1415,7 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport): resolved_user_ids: int = 0 email_ids_missing: int = 0 # resolved users with missing email addresses + looker_user_count: int = 0 _looker_api: Optional[LookerAPI] = None query_latency: Dict[str, datetime.timedelta] = dataclasses_field( @@ -1614,9 +1619,21 @@ def get_urn_dashboard_id(self): class LookerUserRegistry: looker_api_wrapper: LookerAPI fields: str = ",".join(["id", "email", "display_name", "first_name", "last_name"]) + _user_cache: Dict[str, LookerUser] = {} - def __init__(self, looker_api: LookerAPI): + def __init__(self, looker_api: LookerAPI, report: LookerDashboardSourceReport): self.looker_api_wrapper = looker_api + self.report = report + self._initialize_user_cache() + + def 
_initialize_user_cache(self) -> None: + raw_users: Sequence[User] = self.looker_api_wrapper.all_users( + user_fields=self.fields + ) + + for raw_user in raw_users: + looker_user = LookerUser.create_looker_user(raw_user) + self._user_cache[str(looker_user.id)] = looker_user def get_by_id(self, id_: str) -> Optional[LookerUser]: if not id_: @@ -1624,6 +1641,9 @@ def get_by_id(self, id_: str) -> Optional[LookerUser]: logger.debug(f"Will get user {id_}") + if str(id_) in self._user_cache: + return self._user_cache.get(str(id_)) + raw_user: Optional[User] = self.looker_api_wrapper.get_user( str(id_), user_fields=self.fields ) @@ -1632,3 +1652,35 @@ def get_by_id(self, id_: str) -> Optional[LookerUser]: looker_user = LookerUser.create_looker_user(raw_user) return looker_user + + def to_platform_resource( + self, platform_instance: Optional[str] + ) -> Iterable[MetadataChangeProposalWrapper]: + try: + platform_resource_key = PlatformResourceKey( + platform=LOOKER, + resource_type="USER_ID_MAPPING", + platform_instance=platform_instance, + primary_key="", + ) + + # Extract user email mappings + user_email_cache = { + user_id: user.email + for user_id, user in self._user_cache.items() + if user.email + } + + platform_resource = PlatformResource.create( + key=platform_resource_key, + value=user_email_cache, + ) + + self.report.looker_user_count = len(user_email_cache) + yield from platform_resource.to_mcps() + + except Exception as exc: + self.report.warning( + message="Failed to generate platform resource for looker id mappings", + exc=exc, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py index ab55d4e15e5de..c3f2a110136c4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py @@ -68,6 +68,7 @@ class LookerAPIStats(BaseModel): get_look_calls: int = 0 search_looks_calls: int = 0 search_dashboards_calls: int = 0 + all_user_calls: int = 0 class LookerAPI: @@ -135,7 +136,7 @@ def get_available_permissions(self) -> Set[str]: return permissions - @lru_cache(maxsize=1000) + @lru_cache(maxsize=5000) def get_user(self, id_: str, user_fields: str) -> Optional[User]: self.client_stats.user_calls += 1 try: @@ -154,6 +155,17 @@ def get_user(self, id_: str, user_fields: str) -> Optional[User]: # User not found return None + def all_users(self, user_fields: str) -> Sequence[User]: + self.client_stats.all_user_calls += 1 + try: + return self.client.all_users( + fields=cast(str, user_fields), + transport_options=self.transport_options, + ) + except SDKError as e: + logger.warning(f"Failure was {e}") + return [] + def execute_query(self, write_query: WriteQuery) -> List[Dict]: logger.debug(f"Executing query {write_query}") self.client_stats.query_calls += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index cd8ccb8217257..815c5dfb1c014 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -145,7 +145,9 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext): self.source_config: LookerDashboardSourceConfig = config self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport() self.looker_api: LookerAPI = 
LookerAPI(self.source_config) - self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api) + self.user_registry: LookerUserRegistry = LookerUserRegistry( + self.looker_api, self.reporter + ) self.explore_registry: LookerExploreRegistry = LookerExploreRegistry( self.looker_api, self.reporter, self.source_config ) @@ -1673,5 +1675,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield usage_mcp.as_workunit() self.reporter.report_stage_end("usage_extraction") + # Dump looker user resource mappings. + logger.info("Ingesting looker user resource mapping workunits") + self.reporter.report_stage_start("user_resource_extraction") + yield from auto_workunit( + self.user_registry.to_platform_resource( + self.source_config.platform_instance + ) + ) + def get_report(self) -> SourceReport: return self.reporter diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index f7458c4eb4d5b..b49d40a0c7eb6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -9,7 +9,7 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.common.subtypes import BIAssetSubTypes from datahub.ingestion.source.state.stale_entity_removal_handler import ( @@ -232,19 +232,6 @@ def default_for_dataset_type_mapping() -> Dict[str, str]: return dict_ -class PlatformDetail(ConfigModel): - platform_instance: Optional[str] = pydantic.Field( - default=None, - description="DataHub platform instance name. 
To generate correct urn for upstream dataset, this should match " - "with platform instance name used in ingestion " - "recipe of other datahub sources.", - ) - env: str = pydantic.Field( - default=builder.DEFAULT_ENV, - description="The environment that all assets produced by DataHub platform ingestion source belong to", - ) - - class DataBricksPlatformDetail(PlatformDetail): """ metastore is an additional field used in Databricks connector to generate the dataset urn diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py index baaa8d5b85ae1..6d51e853a2fb0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py @@ -2,8 +2,8 @@ from abc import ABC, abstractmethod from typing import Union +from datahub.configuration.source_common import PlatformDetail from datahub.ingestion.source.powerbi.config import ( - PlatformDetail, PowerBiDashboardSourceConfig, PowerBIPlatformDetail, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py index ffaed79f4e42a..63520bd731de8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py @@ -5,13 +5,13 @@ from lark import Tree +from datahub.configuration.source_common import PlatformDetail from datahub.emitter import mce_builder as builder from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( Constant, DataBricksPlatformDetail, DataPlatformPair, - PlatformDetail, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, PowerBIPlatformDetail, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index c3a7912c40e8e..e5883dd0349a3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -540,6 +540,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: identifiers=self.identifiers, schema_resolver=schema_resolver, discovered_tables=discovered_datasets, + graph=self.ctx.graph, ) # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 79ea98d1c7f54..f81eb291e89e1 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -490,7 +490,7 @@ def __init__( self._exit_stack.push(self._query_usage_counts) # Tool Extractor - self._tool_meta_extractor = ToolMetaExtractor() + self._tool_meta_extractor = ToolMetaExtractor.create(graph) self.report.tool_meta_report = self._tool_meta_extractor.report def close(self) -> None: diff --git a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py index 0d85002776e5e..5af9d9d4f0fff 100644 --- 
a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py +++ b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py @@ -1,3 +1,4 @@ +import contextlib import json import logging from dataclasses import dataclass, field @@ -5,8 +6,15 @@ from typing_extensions import Protocol +from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, + PlatformResource, + PlatformResourceSearchFields, +) from datahub.ingestion.api.report import Report +from datahub.ingestion.graph.client import DataHubGraph from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn +from datahub.utilities.search_utils import LogicalOperator from datahub.utilities.stats_collections import int_top_k_dict UrnStr = str @@ -31,6 +39,7 @@ def _get_last_line(query: str) -> str: @dataclass class ToolMetaExtractorReport(Report): num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict) + failures: List[str] = field(default_factory=list) class ToolMetaExtractor: @@ -42,14 +51,81 @@ class ToolMetaExtractor: by warehouse query logs. """ - def __init__(self) -> None: - self.report = ToolMetaExtractorReport() + def __init__( + self, + report: ToolMetaExtractorReport, + looker_user_mapping: Optional[Dict[str, str]] = None, + ) -> None: + self.report = report self.known_tool_extractors: List[Tuple[str, Callable[[QueryLog], bool]]] = [ ( "mode", self._extract_mode_query, - ) + ), + ( + "looker", + self._extract_looker_query, + ), ] + # maps user id (as string) to email address + self.looker_user_mapping = looker_user_mapping + + @classmethod + def create( + cls, + graph: Optional[DataHubGraph] = None, + ) -> "ToolMetaExtractor": + report = ToolMetaExtractorReport() + looker_user_mapping = None + if graph: + try: + looker_user_mapping = cls.extract_looker_user_mapping_from_graph( + graph, report + ) + except Exception as e: + report.failures.append( + f"Unexpected error during Looker user metadata extraction: {str(e)}" + ) + + return cls(report, looker_user_mapping) + + @classmethod + def extract_looker_user_mapping_from_graph( + cls, graph: DataHubGraph, report: ToolMetaExtractorReport + ) -> Optional[Dict[str, str]]: + looker_user_mapping = None + query = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match(PlatformResourceSearchFields.PLATFORM, "looker") + .add_field_match( + PlatformResourceSearchFields.RESOURCE_TYPE, + "USER_ID_MAPPING", + ) + .end() + ) + platform_resources = list( + PlatformResource.search_by_filters(query=query, graph_client=graph) + ) + + if len(platform_resources) > 1: + report.failures.append( + "Looker user metadata extraction failed. Found more than one looker user id mappings." + ) + else: + platform_resource = platform_resources[0] + + if ( + platform_resource + and platform_resource.resource_info + and platform_resource.resource_info.value + ): + with contextlib.suppress(ValueError, AssertionError): + value = platform_resource.resource_info.value.as_raw_json() + if value: + looker_user_mapping = value + + return looker_user_mapping def _extract_mode_query(self, entry: QueryLog) -> bool: """ @@ -78,14 +154,49 @@ def _extract_mode_query(self, entry: QueryLog) -> bool: return True + def _extract_looker_query(self, entry: QueryLog) -> bool: + """ + Returns: + bool: whether QueryLog entry is that of looker and looker user info + is extracted into entry. 
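+
+        Illustrative example of the trailing comment this parser handles
+        (assumed shape; only the "user_id" key is required):
+            -- Looker Query Context '{"user_id": 12}'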
+ """ + if not self.looker_user_mapping: + return False + + last_line = _get_last_line(entry.query_text) + + if not (last_line.startswith("--") and "Looker Query Context" in last_line): + return False + + start_quote_idx = last_line.index("'") + end_quote_idx = last_line.rindex("'") + if start_quote_idx == -1 or end_quote_idx == -1: + return False + + looker_json_raw = last_line[start_quote_idx + 1 : end_quote_idx] + looker_json = json.loads(looker_json_raw) + + user_id = str(looker_json["user_id"]) + email = self.looker_user_mapping.get(user_id) + if not email: + return False + + original_user = entry.user + + entry.user = email_to_user_urn(email) + entry.extra_info = entry.extra_info or {} + entry.extra_info["user_via"] = original_user + + return True + def extract_bi_metadata(self, entry: QueryLog) -> bool: for tool, meta_extractor in self.known_tool_extractors: try: if meta_extractor(entry): self.report.num_queries_meta_extracted[tool] += 1 return True - except Exception: - logger.debug("Tool metadata extraction failed with error : {e}") + except Exception as e: + logger.debug(f"Tool metadata extraction failed with error : {e}") return False diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index a9c445b5986ef..6ae772c134cb3 100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -842,6 +842,62 @@ "pipelineName": "stateful-looker-pipeline" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index af9c62a2a4180..d7620980a9ced 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -497,6 +497,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": 
"urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index b89bc356b48fd..13963af55bfe5 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -735,6 +735,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json index 810fefd8f6cb8..f11d060102851 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json @@ -735,6 +735,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": 
"platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json index 3d78397f54a23..f6e39dd5286cd 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json @@ -828,6 +828,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index 5a540e61e768d..203bed843155c 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -464,6 +464,21 @@ "/Folders/Shared" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": 
"urn:li:corpuser:test-1@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -708,6 +723,21 @@ "/Folders/Personal" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:test-2@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -1108,12 +1138,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/sales_model" + "/Explore/data" ] } }, @@ -1126,12 +1156,12 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "sales_model", + "model": "data", "looker.explore.label": "My Explore View", - "looker.explore.name": "sales_explore", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", + "externalUrl": "https://looker.company.com/explore/data/my_view", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1153,7 +1183,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "sales_explore", + "schemaName": "my_view", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1208,7 +1238,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1227,12 +1257,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" + "renderUrl": "https://looker.company.com/embed/explore/data/my_view" } }, "systemMetadata": { @@ -1244,12 +1274,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } }, "systemMetadata": { @@ -1261,7 +1291,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1271,8 +1301,8 @@ "id": "Explore" }, { - "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", - "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", + "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } ] } @@ -1287,12 
+1317,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/data" + "/Explore/order_model" ] } }, @@ -1305,12 +1335,12 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "data", + "model": "order_model", "looker.explore.label": "My Explore View", - "looker.explore.name": "my_view", + "looker.explore.name": "order_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/data/my_view", + "externalUrl": "https://looker.company.com/explore/order_model/order_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1332,7 +1362,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_view", + "schemaName": "order_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1387,7 +1417,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1406,12 +1436,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/data/my_view" + "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" } }, "systemMetadata": { @@ -1423,12 +1453,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } }, "systemMetadata": { @@ -1440,7 +1470,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1450,8 +1480,8 @@ "id": "Explore" }, { - "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", - "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", + "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } ] } @@ -1466,12 +1496,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/order_model" + "/Explore/sales_model" ] } }, @@ -1484,12 +1514,12 @@ 
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "order_model", + "model": "sales_model", "looker.explore.label": "My Explore View", - "looker.explore.name": "order_explore", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/order_model/order_explore", + "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1511,7 +1541,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "order_explore", + "schemaName": "sales_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1566,7 +1596,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1585,12 +1615,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" + "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" } }, "systemMetadata": { @@ -1602,12 +1632,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } }, "systemMetadata": { @@ -1619,7 +1649,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1629,8 +1659,8 @@ "id": "Explore" }, { - "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", - "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", + "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } ] } @@ -1705,6 +1735,62 @@ "pipelineName": "execution-1" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { 
+ "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index 9ac95b8482a47..87af50f95ed6b 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -793,6 +793,60 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:looker,ap-south-1)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 3a2c6359ea63c..b990ce7c67dab 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -759,6 +759,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 
1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 007eee348aeaf..391192b3d16f3 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -513,6 +513,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json index 859b9163d7aad..4909a6af73a22 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json @@ -464,6 +464,21 @@ "/Folders/Shared" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:test-1@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -1185,6 +1200,62 @@ "pipelineName": "execution-1" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + 
"pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index 8256c984afb27..ddeb5428b1d72 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -762,6 +762,62 @@ "pipelineName": "stateful-looker-pipeline" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", @@ -814,8 +870,8 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,bogus data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -831,8 +887,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -865,8 +921,8 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,bogus data.explore.my_view,PROD)", + 
"entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 0b3530f9c2462..594983c8fb0f2 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -678,6 +678,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index 8bbf14709ff9f..a39de8384efb2 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -83,6 +83,7 @@ def test_looker_ingest(pytestconfig, tmp_path, mock_time): with mock.patch("looker_sdk.init40") as mock_sdk: mock_sdk.return_value = mocked_client setup_mock_dashboard(mocked_client) + mocked_client.run_inline_query.side_effect = side_effect_query_inline setup_mock_explore(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" @@ -319,6 +320,7 @@ def setup_mock_look(mocked_client): mocked_client.all_looks.return_value = [ Look( id="1", + user_id="1", title="Outer Look", description="I am not part of any Dashboard", query_id="1", @@ -327,6 +329,7 @@ def setup_mock_look(mocked_client): Look( id="2", title="Personal Look", + user_id="2", description="I am not part of any Dashboard and in personal folder", query_id="2", folder=FolderBase( @@ -561,6 +564,20 @@ def get_user( mocked_client.user.side_effect = get_user +def setup_mock_all_user(mocked_client): + def all_users( + fields: Optional[str] = None, + transport_options: Optional[transport.TransportOptions] = None, + ) -> List[User]: + return [ + User(id="1", email="test-1@looker.com"), + User(id="2", email="test-2@looker.com"), + User(id="3", email="test-3@looker.com"), + ] + + mocked_client.all_users.side_effect = all_users + + def side_effect_query_inline( result_format: str, body: WriteQuery, 
transport_options: Optional[TransportOptions] ) -> str: @@ -714,6 +731,7 @@ def test_looker_ingest_usage_history(pytestconfig, tmp_path, mock_time): mocked_client.run_inline_query.side_effect = side_effect_query_inline setup_mock_explore(mocked_client) setup_mock_user(mocked_client) + setup_mock_all_user(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" @@ -946,6 +964,8 @@ def ingest_independent_looks( mock_sdk.return_value = mocked_client setup_mock_dashboard(mocked_client) setup_mock_explore(mocked_client) + setup_mock_user(mocked_client) + setup_mock_all_user(mocked_client) setup_mock_look(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py b/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py index 6f590b5307146..f6566f007f5e6 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py @@ -1,11 +1,14 @@ from datahub.configuration.datetimes import parse_absolute_time from datahub.metadata.urns import CorpUserUrn from datahub.sql_parsing.sql_parsing_aggregator import PreparsedQuery -from datahub.sql_parsing.tool_meta_extractor import ToolMetaExtractor +from datahub.sql_parsing.tool_meta_extractor import ( + ToolMetaExtractor, + ToolMetaExtractorReport, +) def test_extract_mode_metadata() -> None: - extractor = ToolMetaExtractor() + extractor = ToolMetaExtractor(report=ToolMetaExtractorReport()) query = """\ select * from LONG_TAIL_COMPANIONS.ADOPTION.PET_PROFILES LIMIT 100 @@ -30,8 +33,42 @@ def test_extract_mode_metadata() -> None: assert extractor.report.num_queries_meta_extracted["mode"] == 1 +def test_extract_looker_metadata() -> None: + extractor = ToolMetaExtractor( + report=ToolMetaExtractorReport(), looker_user_mapping={"7": "john.doe@xyz.com"} + ) + looker_query = """\ +SELECT + all_entities_extended_sibling."ENTITY" AS "all_entities_extended_sibling.entity_type", + COUNT(DISTINCT ( all_entities_extended_sibling."URN" )) AS "all_entities_extended_sibling.distinct_count" +FROM "PUBLIC"."ALL_ENTITIES" + AS all_entities_extended_sibling +GROUP BY + 1 +ORDER BY + 1 +FETCH NEXT 50 ROWS ONLY +-- Looker Query Context '{"user_id":7,"history_slug":"264797031bc403cf382cbefbe3700849","instance_slug":"32654f2ffadf10b1949d4009e52fc6a4"}' +""" + + entry = PreparsedQuery( + query_id=None, + query_text=looker_query, + upstreams=[], + downstream=None, + column_lineage=None, + column_usage=None, + inferred_schema=None, + user=CorpUserUrn("mode"), + timestamp=parse_absolute_time("2021-08-01T01:02:03Z"), + ) + assert extractor.extract_bi_metadata(entry) + assert entry.user == CorpUserUrn("john.doe") + assert extractor.report.num_queries_meta_extracted["looker"] == 1 + + def test_extract_no_metadata() -> None: - extractor = ToolMetaExtractor() + extractor = ToolMetaExtractor(report=ToolMetaExtractorReport()) query = """\ select * from LONG_TAIL_COMPANIONS.ADOPTION.PET_PROFILES LIMIT 100 @@ -53,3 +90,4 @@ def test_extract_no_metadata() -> None: assert not extractor.extract_bi_metadata(entry) assert extractor.report.num_queries_meta_extracted["mode"] == 0 + assert extractor.report.num_queries_meta_extracted["looker"] == 0 diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py index 
85c86f8d205d9..5631ad2c69f94 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py @@ -37,7 +37,11 @@ def stateful_source(mock_datahub_graph: DataHubGraph) -> Iterable[SnowflakeV2Sou ), ) - with mock.patch("snowflake.connector.connect"): + with mock.patch( + "datahub.sql_parsing.sql_parsing_aggregator.ToolMetaExtractor.create", + ) as mock_checkpoint, mock.patch("snowflake.connector.connect"): + mock_checkpoint.return_value = mock.MagicMock() + yield SnowflakeV2Source(ctx=ctx, config=config) diff --git a/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl b/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl index 2f36eda9141ab..1a1dbea4359fb 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl @@ -9,9 +9,13 @@ enum PlatformResourceType { /** * e.g. a Slack member resource, Looker user resource, etc. */ - USER_INFO, + USER_INFO, /** * e.g. a Slack channel */ CONVERSATION + /** + * e.g. Looker mapping of all user ids + */ + USER_ID_MAPPING }
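
For context on the test changes above: the new Looker path in the tool-metadata extraction keys off the trailing "-- Looker Query Context '{...}'" SQL comment and the ingestion-time user-id-to-email mapping that the new USER_ID_MAPPING platform resource persists. The sketch below is only an illustration of that resolution step under those assumptions; the function name, regex, and email-to-corpuser rule here are hypothetical and are not the actual ToolMetaExtractor internals.

    import json
    import re
    from typing import Dict, Optional

    # Illustrative only: parse the Looker query-context comment and map the
    # embedded user_id to a corpuser name via the ingestion-time mapping.
    LOOKER_CONTEXT_RE = re.compile(r"-- Looker Query Context '(?P<ctx>\{.*\})'")

    def resolve_looker_user(
        query_text: str, looker_user_mapping: Dict[str, str]
    ) -> Optional[str]:
        """Return a corpuser name for the Looker user_id in the query, if resolvable."""
        match = LOOKER_CONTEXT_RE.search(query_text)
        if not match:
            return None
        context = json.loads(match.group("ctx"))
        email = looker_user_mapping.get(str(context.get("user_id")))
        if email is None:
            return None
        # The unit test above expects the local part of the email as the corpuser id.
        return email.split("@")[0]

    if __name__ == "__main__":
        mapping = {"7": "john.doe@xyz.com"}
        query = "SELECT 1\n-- Looker Query Context '{\"user_id\":7,\"history_slug\":\"abc\"}'\n"
        assert resolve_looker_user(query, mapping) == "john.doe"

This mirrors the assertion added in test_extract_looker_metadata (user_id 7 resolving to john.doe) without claiming to reproduce the production implementation.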