diff --git a/lambdas/prospect-translation-lambda/src/events/snowplow.ts b/lambdas/prospect-translation-lambda/src/events/snowplow.ts index 661a3b4c..a90d9454 100644 --- a/lambdas/prospect-translation-lambda/src/events/snowplow.ts +++ b/lambdas/prospect-translation-lambda/src/events/snowplow.ts @@ -54,14 +54,13 @@ export const generateContext = ( */ export const generateSnowplowEntity = ( prospect: Prospect, - prospectSource: string, runDetails: ProspectRunDetails, features: ProspectFeatures, ): SnowplowProspect => { return { object_version: 'new', prospect_id: prospect.prospectId, - prospect_source: prospectSource, + prospect_source: prospect.prospectType, scheduled_surface_id: prospect.scheduledSurfaceGuid, url: prospect.url, title: prospect.title, diff --git a/lambdas/prospect-translation-lambda/src/index.integration.ts b/lambdas/prospect-translation-lambda/src/index.integration.ts index 99731b50..2364b738 100644 --- a/lambdas/prospect-translation-lambda/src/index.integration.ts +++ b/lambdas/prospect-translation-lambda/src/index.integration.ts @@ -1,5 +1,6 @@ import { setupServer } from 'msw/node'; import { Callback, Context } from 'aws-lambda'; +import * as Sentry from '@sentry/serverless'; import config from './config'; @@ -19,6 +20,8 @@ import { */ describe('prospect api translation lambda entry function', () => { const server = setupServer(); + const captureConsoleSpy = jest.spyOn(console, 'log').mockImplementation(); + const sentrySpy = jest.spyOn(Sentry, 'captureException').mockImplementation(); beforeAll(() => server.listen({ onUnhandledRequest: 'bypass' })); @@ -28,12 +31,14 @@ describe('prospect api translation lambda entry function', () => { }); afterEach(() => { - // restoreAllMocks restores all mocks and replaced properties. clearAllMocks only clears mocks. 
- jest.restoreAllMocks(); + // clear all mock history + jest.clearAllMocks(); server.resetHandlers(); }); afterAll(() => { + // restore all mocks and replaced properties/methods + jest.restoreAllMocks(); server.close(); }); @@ -81,6 +86,7 @@ describe('prospect api translation lambda entry function', () => { // Check that the right Snowplow entity was included with the event. const goodEvents = await getGoodSnowplowEvents(); + for (let i = 0; i < 2; i++) { const snowplowContext = parseSnowplowData( goodEvents[i].rawEvent.parameters.cx, @@ -119,8 +125,6 @@ describe('prospect api translation lambda entry function', () => { ], }; - const captureConsoleSpy = jest.spyOn(console, 'log').mockImplementation(); - await processor(fakePayload, sqsContext, sqsCallback); expect(captureConsoleSpy).toHaveBeenCalledWith( @@ -152,8 +156,6 @@ describe('prospect api translation lambda entry function', () => { ], }; - const captureConsoleSpy = jest.spyOn(console, 'log').mockImplementation(); - await processor(fakePayload, sqsContext, sqsCallback); expect(captureConsoleSpy).toHaveBeenCalledWith( @@ -162,4 +164,78 @@ describe('prospect api translation lambda entry function', () => { expect(captureConsoleSpy).toHaveBeenCalledWith(`1 prospects had errors.`); }); + + it('accepts prospects with ML-supplied URL metadata', async () => { + const fakePayload = { + Records: [ + { + messageId: '1', + receiptHandle: 'handle', + attributes: { + ApproximateReceiveCount: '1', + SentTimestamp: 'time', + SenderId: 'sender id', + ApproximateFirstReceiveTimestamp: 'time', + }, + messageAttributes: {}, + md5OfMessageAttributes: null, + md5OfBody: 'ab6181399b03008ffaada54b68c77574', + eventSource: 'aws:sqs', + eventSourceARN: + 'arn:aws:sqs:us-east-1:996905175585:ProspectAPI-Prod-Sqs-Translation-Queue', + awsRegion: 'us-east-1', + body: 
'{"version":"0","id":"ab02d85b-4cb6-9de9-b549-b572166b278f","detail-type":"prospect-generation","source":"prospect-events","account":"996905175585","time":"2024-04-16T00:05:59Z","region":"us-east-1","resources":[],"detail":{"id":"c71504d1-f14f-4181-a654-730d5855ec48","version":3,"candidates":[{"scheduled_surface_guid": "NEW_TAB_DE_DE", "prospect_id": "447c90d2-1084-5f83-a585-26edfbf5640e", "url": "https://www.spektrum.de/news/ninetyeast-ridge-eines-der-laengsten-gebirge-liegt-tief-unter-dem-meer/2246024", "prospect_source": "QA_SPORTS", "save_count": 0, "predicted_topic": "", "rank": 117, "data_source": "prospect", "title": "Eines der l\u00e4ngsten Gebirge liegt tief unter dem Meer", "excerpt": "Die Bergkette wurde durch ein seltenes vulkanisches Ph\u00e4nomen gebildet", "language": "EN", "image_url": "https://static.spektrum.de/fm/912/f1920x1080/triplejunction_gis_2014_lrg.8200272.png", "authors": ["Daniel Lingenh\u00f6hl"]},{"scheduled_surface_guid": "NEW_TAB_DE_DE", "prospect_id": "faa0631f-3a43-555a-9947-7ce3de165a92", "url": "https://www.spiegel.de/ausland/wahlen-in-thailand-wie-eine-kandidatin-fuer-ein-demokratischeres-thailand-kaempft-a-1fa52682-ca7a-45f9-816a-a031ca0a7950", "prospect_source": "QA_SPORTS", "save_count": 10, "predicted_topic": "POLITICS", "rank": 118, "data_source": "prospect", "title": "Parlamentswahlen in Thailand Frau Thamnitinans Kampf gegen die Putschisten", "excerpt": "Sie vertritt als Anwältin Regimegegner. Nun will Sasinan Thamnitinan sich im Parlament für mehr Demokratie in Thailand einsetzen. 
Unterwegs im ersten Wahlkampf nach den landesweiten Massenprotesten gegen das Militär.", "language": "DE", "image_url": "https://cdn.prod.www.spiegel.de/images/06720642-d6e0-42ed-a1e1-1ded9f79d1bd_w1280_r1.77_fpx68_fpy93.jpg", "authors": ["Maria Stöhr, DER SPIEGEL"]}]}}', + }, + ], + }; + + await processor(fakePayload, sqsContext, sqsCallback); + + expect(sentrySpy).toHaveBeenCalledTimes(0); + + expect(captureConsoleSpy).toHaveBeenCalledWith( + `2 prospects inserted into dynamo.`, + ); + }); + + it('should send errors to Sentry when ML-supplied URL metadata is invalid', async () => { + // this payload has three errors in the second/last prospect: + // - missing image_url + // - number given for authors + // - invalid language code + const fakePayload = { + Records: [ + { + messageId: '1', + receiptHandle: 'handle', + attributes: { + ApproximateReceiveCount: '1', + SentTimestamp: 'time', + SenderId: 'sender id', + ApproximateFirstReceiveTimestamp: 'time', + }, + messageAttributes: {}, + md5OfMessageAttributes: null, + md5OfBody: 'ab6181399b03008ffaada54b68c77574', + eventSource: 'aws:sqs', + eventSourceARN: + 'arn:aws:sqs:us-east-1:996905175585:ProspectAPI-Prod-Sqs-Translation-Queue', + awsRegion: 'us-east-1', + body: '{"version":"0","id":"ab02d85b-4cb6-9de9-b549-b572166b278f","detail-type":"prospect-generation","source":"prospect-events","account":"996905175585","time":"2024-04-16T00:05:59Z","region":"us-east-1","resources":[],"detail":{"id":"c71504d1-f14f-4181-a654-730d5855ec48","version":3,"candidates":[{"scheduled_surface_guid": "NEW_TAB_DE_DE", "prospect_id": "447c90d2-1084-5f83-a585-26edfbf5640e", "url": "https://www.spektrum.de/news/ninetyeast-ridge-eines-der-laengsten-gebirge-liegt-tief-unter-dem-meer/2246024", "prospect_source": "QA_ENTERTAINMENT", "save_count": 0, "predicted_topic": "", "rank": 117, "data_source": "prospect", "title": "Eines der l\u00e4ngsten Gebirge liegt tief unter dem Meer", "excerpt": "Die Bergkette wurde durch ein seltenes 
vulkanisches Ph\u00e4nomen gebildet", "language": "EN", "image_url": "https://static.spektrum.de/fm/912/f1920x1080/triplejunction_gis_2014_lrg.8200272.png", "authors": ["Daniel Lingenh\u00f6hl"]},{"scheduled_surface_guid": "NEW_TAB_DE_DE", "prospect_id": "faa0631f-3a43-555a-9947-7ce3de165a92", "url": "https://www.spiegel.de/ausland/wahlen-in-thailand-wie-eine-kandidatin-fuer-ein-demokratischeres-thailand-kaempft-a-1fa52682-ca7a-45f9-816a-a031ca0a7950", "prospect_source": "QA_ENTERTAINMENT", "save_count": 10, "predicted_topic": "POLITICS", "rank": 118, "data_source": "prospect", "title": 16, "excerpt": "Sie vertritt als Anwältin Regimegegner. Nun will Sasinan Thamnitinan sich im Parlament für mehr Demokratie in Thailand einsetzen. Unterwegs im ersten Wahlkampf nach den landesweiten Massenprotesten gegen das Militär.", "language": "BB", "authors": 42}]}}', + }, + ], + }; + + await processor(fakePayload, sqsContext, sqsCallback); + + // authors, image_url, and language are invalid in the test payload above + // - all should have triggered a Sentry call + expect(sentrySpy).toHaveBeenCalledTimes(3); + + // the errors in ML-supplied URL metadata should *not* stop the prospects + // from being inserted! 
+ expect(captureConsoleSpy).toHaveBeenCalledWith( + `2 prospects inserted into dynamo.`, + ); + }); }); diff --git a/lambdas/prospect-translation-lambda/src/index.ts b/lambdas/prospect-translation-lambda/src/index.ts index 07eb08eb..d2c4b09e 100644 --- a/lambdas/prospect-translation-lambda/src/index.ts +++ b/lambdas/prospect-translation-lambda/src/index.ts @@ -50,6 +50,7 @@ export const processor: SQSHandler = async (event: SQSEvent): Promise => { console.log('raw event:'); console.log(event); + let prospectIdProcessed: string; let prospectIdsProcessed: string[] = []; // make sure the event payload is JSON-parseable and of SQS shape @@ -110,14 +111,18 @@ export const processor: SQSHandler = async (event: SQSEvent): Promise => { // now get the metadata and put it into dynamo // this function will send an exception to sentry if any part of it // fails. - prospectIdsProcessed = await processProspect( - prospect, - prospectIdsProcessed, - rawSqsProspect.prospect_source, - runDetails, - features, - tracker, - ); + await processProspect(prospect, runDetails, features, tracker); + + // an edge case we've hit before - ML was sending duplicate prospects in a + // single batch. we don't need to error here - dynamo will silently replace + // the existing entry. logging seems like the best approach for now. 
+ if (prospectIdsProcessed.includes(prospect.id)) { + console.log( + `${prospect.id} is a duplicate in this ${prospect.scheduledSurfaceGuid} / ${prospect.prospectType} batch!`, + ); + } + + prospectIdsProcessed.push(prospect.id); } } } diff --git a/lambdas/prospect-translation-lambda/src/lib.spec.ts b/lambdas/prospect-translation-lambda/src/lib.spec.ts index f9ab96e3..9083a0b3 100644 --- a/lambdas/prospect-translation-lambda/src/lib.spec.ts +++ b/lambdas/prospect-translation-lambda/src/lib.spec.ts @@ -24,6 +24,7 @@ import { validateProperties, validateStructure, } from './lib'; +import { SqsProspect, ProspectTypesWithMlUrlMetadata } from './types'; describe('lib', () => { const captureExceptionSpy = jest @@ -61,7 +62,7 @@ describe('lib', () => { title: 'Test-Title', publisher: 'test-publisher', isCollection: false, - isSyndicated: true, + isSyndicated: false, authors: 'questlove,rafael frumkin', }; @@ -317,6 +318,120 @@ describe('lib', () => { expect(result.topic).toEqual(expected.topic); expect(result.saveCount).toEqual(expected.saveCount); }); + + it('should create a prospect from an SqsProspect with ML-supplied metadata', () => { + const validSqsProspectWithMlData: SqsProspect = { + ...validSqsProspect, + // pick any prospect type that has ML-supplied metadata + prospect_source: ProspectTypesWithMlUrlMetadata[0], + authors: ['Daniel Lingenh\u00f6hl'], + excerpt: + 'Die Bergkette wurde durch ein seltenes vulkanisches Ph\u00e4nomen gebildet', + image_url: + 'https://static.spektrum.de/fm/912/f1920x1080/triplejunction_gis_2014_lrg.8200272.png', + language: 'EN', + title: 'Eines der l\u00e4ngsten Gebirge liegt tief unter dem Meer', + }; + + const expected: Prospect = { + id: 'c3h5n3o9', + prospectId: validSqsProspectWithMlData.prospect_id, + scheduledSurfaceGuid: validSqsProspectWithMlData.scheduled_surface_guid, + url: validSqsProspectWithMlData.url, + prospectType: ProspectType[validSqsProspectWithMlData.prospect_source], + topic: 
Topics[validSqsProspectWithMlData.predicted_topic], + saveCount: validSqsProspectWithMlData.save_count, + rank: validSqsProspectWithMlData.rank, + authors: 'Daniel Lingenh\u00f6hl', + excerpt: + 'Die Bergkette wurde durch ein seltenes vulkanisches Ph\u00e4nomen gebildet', + imageUrl: + 'https://static.spektrum.de/fm/912/f1920x1080/triplejunction_gis_2014_lrg.8200272.png', + language: 'EN', + title: 'Eines der l\u00e4ngsten Gebirge liegt tief unter dem Meer', + }; + + const result = convertSqsProspectToProspect(validSqsProspectWithMlData); + + expect(result.id).toBeDefined(); // we trust uuidV4 to work + expect(result.prospectId).toEqual(expected.prospectId); + expect(result.scheduledSurfaceGuid).toEqual( + expected.scheduledSurfaceGuid, + ); + expect(result.url).toEqual(expected.url); + expect(result.prospectType).toEqual(expected.prospectType); + expect(result.topic).toEqual(expected.topic); + expect(result.saveCount).toEqual(expected.saveCount); + expect(result.authors).toEqual(expected.authors); + expect(result.excerpt).toEqual(expected.excerpt); + expect(result.imageUrl).toEqual(expected.imageUrl); + expect(result.language).toEqual(expected.language); + expect(result.title).toEqual(expected.title); + }); + + it.only('should emit Sentry errors when ML-supplied URL metadata is invalid', () => { + const validSqsProspectWithMlData: any = { + ...validSqsProspect, + // pick any prospect type that has ML-supplied metadata + prospect_source: ProspectTypesWithMlUrlMetadata[0], + // invalid - should get thrown out + authors: 42, + excerpt: + 'Die Bergkette wurde durch ein seltenes vulkanisches Ph\u00e4nomen gebildet', + // invalid - should get thrown out + language: 'BB', + // should get converted to a string + title: 16, + }; + + const expected: Prospect = { + id: 'c3h5n3o9', + prospectId: validSqsProspectWithMlData.prospect_id, + scheduledSurfaceGuid: validSqsProspectWithMlData.scheduled_surface_guid, + url: validSqsProspectWithMlData.url, + prospectType: 
ProspectType[validSqsProspectWithMlData.prospect_source], + topic: Topics[validSqsProspectWithMlData.predicted_topic], + saveCount: validSqsProspectWithMlData.save_count, + rank: validSqsProspectWithMlData.rank, + excerpt: + 'Die Bergkette wurde durch ein seltenes vulkanisches Ph\u00e4nomen gebildet', + title: '16', + }; + + const result = convertSqsProspectToProspect(validSqsProspectWithMlData); + + expect(result.id).toBeDefined(); // we trust uuidV4 to work + expect(result.prospectId).toEqual(expected.prospectId); + expect(result.scheduledSurfaceGuid).toEqual( + expected.scheduledSurfaceGuid, + ); + expect(result.url).toEqual(expected.url); + expect(result.prospectType).toEqual(expected.prospectType); + expect(result.topic).toEqual(expected.topic); + expect(result.saveCount).toEqual(expected.saveCount); + expect(result.authors).toEqual(undefined); + expect(result.excerpt).toEqual(expected.excerpt); + expect(result.imageUrl).toEqual(undefined); + expect(result.language).toEqual(undefined); + expect(result.title).toEqual(expected.title); + + expect(captureExceptionSpy).toHaveBeenCalledTimes(3); + + expect(captureExceptionSpy).toHaveBeenNthCalledWith( + 1, + `Invalid ML supplied value for 'authors': 42`, + ); + + expect(captureExceptionSpy).toHaveBeenNthCalledWith( + 2, + `Invalid ML supplied value for 'image_url': undefined`, + ); + + expect(captureExceptionSpy).toHaveBeenNthCalledWith( + 3, + `Invalid ML supplied value for 'language': BB`, + ); + }); }); describe('hydrateProspectMetaData', () => { @@ -388,8 +503,8 @@ describe('lib', () => { language: undefined, title: undefined, publisher: undefined, - isCollection: undefined, - isSyndicated: undefined, + isCollection: false, + isSyndicated: false, authors: undefined, }; diff --git a/lambdas/prospect-translation-lambda/src/lib.ts b/lambdas/prospect-translation-lambda/src/lib.ts index 616af1e4..30c2263c 100644 --- a/lambdas/prospect-translation-lambda/src/lib.ts +++ b/lambdas/prospect-translation-lambda/src/lib.ts 
@@ -5,6 +5,7 @@ import { v4 as uuidv4 } from 'uuid'; import { dbClient, + deriveDomainName, deriveUrlMetadata, insertProspect, Prospect, @@ -22,7 +23,7 @@ import { UrlMetadata, } from 'content-common'; -import { SqsProspect } from './types'; +import { SqsProspect, ProspectTypesWithMlUrlMetadata } from './types'; import { generateSnowplowEntity, queueSnowplowEvent } from './events/snowplow'; import { Tracker } from '@snowplow/node-tracker'; @@ -128,38 +129,36 @@ export const getProspectRunDetailsFromMessageJson = ( } }; +/** + * retrieves URL metadata from the Parser to hydrate the prospect. + * inserts the prospect into dynamo and sends snowplow event. + * + * @param prospect a Prospect object with partially hydrated data + * @param runDetails the ML run details, sent to snowplow + * @param features the ML prospect features, sent to snowplow + * @param tracker the snowplow Tracker + * @returns Promise + */ export const processProspect = async ( prospect: Prospect, - idsProcessed: string[], - prospectSource: string, runDetails: ProspectRunDetails, features: ProspectFeatures, tracker: Tracker, -): Promise => { +): Promise => { + // get URL metadata from the Parser const urlMetadata = await deriveUrlMetadata(prospect.url); + // hydrate necessary URL metadata prospect = hydrateProspectMetadata(prospect, urlMetadata); + // insert the prospect into dynamodb await insertProspect(dbClient, prospect); - // an edge case we've hit before - ML was sending duplicate prospects in a - // single batch. we don't need to error here - dynamo will silently replace - // the existing entry. logging seems like the best approach for now. - if (idsProcessed.includes(prospect.id)) { - console.log( - `${prospect.id} is a duplicate in this ${prospect.scheduledSurfaceGuid} / ${prospect.prospectType} batch!`, - ); - } - - idsProcessed.push(prospect.id); - // Finally, Send a Snowplow event after the prospect got successfully created in dynamo. 
queueSnowplowEvent( tracker, - generateSnowplowEntity(prospect, prospectSource, runDetails, features), + generateSnowplowEntity(prospect, runDetails, features), ); - - return idsProcessed; }; /** @@ -363,10 +362,17 @@ export const validateProperties = (sqsProspect: SqsProspect): boolean => { } }; +/** + * takes a raw prospect from SQS and converts it to a Prospect object as + * expected by DynamoDB + * + * @param sqsProspect raw prospect object from SQS + * @returns Prospect object + */ export const convertSqsProspectToProspect = ( sqsProspect: SqsProspect, ): Prospect => { - return { + let prospect: Prospect = { id: uuidv4(), prospectId: sqsProspect.prospect_id, // make sure this matches our ALL CAPS guid value @@ -377,6 +383,59 @@ export const convertSqsProspectToProspect = ( saveCount: sqsProspect.save_count, rank: sqsProspect.rank, }; + + // 2024-12-12 + // some prospects will have ML-supplied URL metadata. this is currently + // experimental to validate metadata from ML, so we want to capture any + // issues in Sentry, but not block processing to ensure editors see the + // missing pieces of data. 
+ if (ProspectTypesWithMlUrlMetadata.includes(prospect.prospectType)) { + try { + prospect.authors = sqsProspect.authors.join(','); + } catch { + Sentry.captureException( + `Invalid ML supplied value for 'authors': ${sqsProspect.authors}`, + ); + } + + try { + prospect.excerpt = sqsProspect.excerpt.toString(); + } catch { + Sentry.captureException( + `Invalid ML supplied value for 'excerpt': ${sqsProspect.excerpt}`, + ); + } + + try { + prospect.imageUrl = sqsProspect.image_url.toString(); + } catch { + Sentry.captureException( + `Invalid ML supplied value for 'image_url': ${sqsProspect.image_url}`, + ); + } + + try { + prospect.title = sqsProspect.title.toString(); + } catch { + Sentry.captureException( + `Invalid ML supplied value for 'title': ${sqsProspect.title}`, + ); + } + + // language must map to our enum - if it doesn't, skip setting this property + if ( + sqsProspect.language && + sqsProspect.language.toUpperCase() in CorpusLanguage + ) { + prospect.language = sqsProspect.language.toUpperCase(); + } else { + Sentry.captureException( + `Invalid ML supplied value for 'language': ${sqsProspect.language}`, + ); + } + } + + return prospect; }; /** @@ -390,38 +449,43 @@ export const hydrateProspectMetadata = ( prospect: Prospect, urlMetadata: UrlMetadata, ): Prospect => { - // title and excerpt have different formatting based on prospect language - let title: string; - let excerpt: string; - - if (urlMetadata.language?.toUpperCase() === CorpusLanguage.EN) { - title = formatQuotesEN(applyApTitleCase(urlMetadata.title)) as string; - excerpt = formatQuotesEN(urlMetadata.excerpt) as string; - } else if (urlMetadata.language?.toUpperCase() === CorpusLanguage.DE) { - title = formatQuotesDashesDE(urlMetadata.title) as string; - excerpt = formatQuotesDashesDE(urlMetadata.excerpt) as string; - } else { - title = urlMetadata.title; - excerpt = urlMetadata.excerpt; - } + // while we are moving towards ML-supplied metadata, the Parser must still + // give us the publisher 
name. + // (from the legacy MySQL `readitla_b.domain_business_metadata` table) + prospect.publisher = urlMetadata.publisher; - // Mutating the function argument here to avoid creating - // more objects and be memory efficient + // ML is no longer sending syndicated/collections as prospects + prospect.isCollection = false; + prospect.isSyndicated = false; - // While the the urlMetaData and prospect fields match currently, - // they're not guaranteed to be the same in the future hence we're - // directly assigning them + if (ProspectTypesWithMlUrlMetadata.includes(prospect.prospectType)) { + // URL metadata was supplied by ML and assigned in + // `convertSqsProspectToProspect` above. we specifically do *not* want to + // fall-back to Parser metadata in this scenario, as we want to know if + // any data is missing from ML. - // NOTE: individual url metadata fields might be undefined - prospect.domain = urlMetadata.domain; - prospect.excerpt = excerpt; - prospect.imageUrl = urlMetadata.imageUrl; - prospect.isCollection = urlMetadata.isCollection; - prospect.isSyndicated = urlMetadata.isSyndicated; - prospect.language = urlMetadata.language; - prospect.publisher = urlMetadata.publisher; - prospect.title = title; - prospect.authors = urlMetadata.authors; + // `deriveDomainName` is the same method used under the hood in + // `deriveUrlMetadata` - calling directly here to clarify no Parser use. 
+ prospect.domain = deriveDomainName(prospect.url); + } else { + // URL metadata *not* supplied by ML, so use the Parser + // NOTE: urlMetadata fields might be undefined/empty + prospect.authors = urlMetadata.authors; + prospect.domain = urlMetadata.domain; + prospect.excerpt = urlMetadata.excerpt; + prospect.imageUrl = urlMetadata.imageUrl; + prospect.language = urlMetadata.language; + prospect.title = urlMetadata.title; + } + + // apply title/excerpt formatting for EN & DE + if (prospect.language?.toUpperCase() === CorpusLanguage.EN) { + prospect.title = formatQuotesEN(applyApTitleCase(prospect.title)) as string; + prospect.excerpt = formatQuotesEN(prospect.excerpt) as string; + } else if (prospect.language?.toUpperCase() === CorpusLanguage.DE) { + prospect.title = formatQuotesDashesDE(prospect.title) as string; + prospect.excerpt = formatQuotesDashesDE(prospect.excerpt) as string; + } return prospect; }; diff --git a/lambdas/prospect-translation-lambda/src/types.ts b/lambdas/prospect-translation-lambda/src/types.ts index 7df3a360..9b7d0f69 100644 --- a/lambdas/prospect-translation-lambda/src/types.ts +++ b/lambdas/prospect-translation-lambda/src/types.ts @@ -1,11 +1,34 @@ +import { CorpusLanguage, ProspectType } from 'content-common'; + // this is the raw data from metaflow/sqs export interface SqsProspect { - prospect_id: string; - scheduled_surface_guid: string; + data_source?: string; predicted_topic: string; + prospect_id: string; prospect_source: string; - data_source?: string; - url: string; - save_count: number; rank: number; + save_count: number; + scheduled_surface_guid: string; + url: string; + // 2024-12-12 + // some ML prospects will contain URL metadata to be used instead of Parser + // metadata. this is currently experimental, so all properties below are + // optional. 
+ authors?: string[]; + excerpt?: string; + image_url?: string; + language?: CorpusLanguage; + title?: string; } + +// 2024-12-12 +// noting which prospect types will have ML-supplied URL metadata. this is +// likely a temporary array, as we should move to a consistent URL metadata +// source for all prospect types. +export const ProspectTypesWithMlUrlMetadata: ProspectType[] = [ + ProspectType.QA_ENTERTAINMENT, + ProspectType.QA_GAMING, + ProspectType.QA_HISTORY, + ProspectType.QA_RELATIONSHIPS, + ProspectType.QA_SPORTS, +]; diff --git a/packages/prospectapi-common/src/index.ts b/packages/prospectapi-common/src/index.ts index 3d90d3f9..6b99b244 100644 --- a/packages/prospectapi-common/src/index.ts +++ b/packages/prospectapi-common/src/index.ts @@ -4,7 +4,7 @@ export { DynamoItem, GetProspectsFilters, Prospect } from './types'; export { ScheduledSurfaces, ScheduledSurface } from 'content-common'; -export { toUnixTimestamp, deriveUrlMetadata } from './lib'; +export { toUnixTimestamp, deriveDomainName, deriveUrlMetadata } from './lib'; export { scanAllRows, generateInsertParams,