Skip to content

Commit

Permalink
Merge pull request #16 from georgegebbett/yoast-json
Browse files Browse the repository at this point in the history
Yoast json scraping added
  • Loading branch information
georgegebbett authored May 14, 2022
2 parents d6537de + 44f237c commit b8658c6
Show file tree
Hide file tree
Showing 3 changed files with 488 additions and 96 deletions.
133 changes: 45 additions & 88 deletions api/src/recipes/scraper/recipeScraper.spec.ts
Original file line number Diff line number Diff line change
@@ -1,96 +1,23 @@
import { RecipeScraper } from './recipeScraper';
import { Test, TestingModule } from '@nestjs/testing';
import { JSDOM } from 'jsdom';
import { Recipe } from '../schemas/recipe.schema';
import {
allRecipesDomString,
allRecipesMetadataObject,
allRecipesRecipe,
allRecipesUrl,
belliniMetadataObject,
belliniUrl,
mockNodeList,
mockRecipe,
yoastDomString,
yoastMetadataObject,
yoastRecipe,
yoastUrl,
} from './recipeScraperTestConstants';

describe('RecipeScraper', () => {
let scraper: RecipeScraper;

const belliniUrl = 'https://www.bbcgoodfood.com/recipes/bellini';

const mockNodeList = (
domString = '<script type="application/ld+json">{"@context":"https://schema.org","@id":"https://www.bbcgoodfood.com/recipes/bellini#Recipe","@type":"Recipe","description":"A classy cocktail served in an elegant flute - this simple combination of peach purée and Prosecco makes a great start to any celebration","image":{"@type":"ImageObject","height":400,"url":"https://images.immediate.co.uk/production/volatile/sites/30/2020/08/bellini-b049342.jpg","width":440},"mainEntityOfPage":{"@type":"WebPage","@id":"https://www.bbcgoodfood.com/recipes/bellini"},"name":"Bellini","url":"https://www.bbcgoodfood.com/recipes/bellini","author":{"@type":"Person","name":"Good Food team"},"dateModified":"2020-08-08T02:26:26+00:00","datePublished":"2013-11-18T16:37:44+00:00","headline":"Bellini","keywords":"Christmas, Christmas morning, cocktails canapes, Good Food, Party, sparkling cocktail","publisher":{"@type":"Organization","name":"BBC Good Food","url":"https://www.bbcgoodfood.com","logo":{"@type":"ImageObject","url":"https://images.immediate.co.uk/production/volatile/sites/30/2019/07/GoodFood-dark-516d417.png","width":221,"height":58}},"nutrition":{"@type":"NutritionInformation","calories":"143 calories","carbohydrateContent":"18 grams carbohydrates","sugarContent":"18 grams sugar","fiberContent":"0.7 grams fiber","proteinContent":"0.7 grams protein"},"prepTime":"PT5M","recipeCategory":"Cocktails","recipeIngredient":["500ml peach purée or peach nectar","1 bottle prosecco"],"recipeInstructions":[{"@type":"HowToStep","text":"Put the peach puree in a Champagne flute up to about 1/3 full and slowly top up with Prosecco."}],"recipeYield":6,"totalTime":"PT5M"}</script><script 
type="application/ld+json">{"@context":"https://schema.org/","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https://www.bbcgoodfood.com/"},{"@type":"ListItem","position":2,"name":"Recipes","item":"https://www.bbcgoodfood.com/recipes"},{"@type":"ListItem","position":3,"name":"Bellini"}]}</script>{"@context":"https://schema.org/","@id":"https://www.bbcgoodfood.com/recipes/bellini#Recipe","aggregateRating":{"@type":"AggregateRating","ratingValue":4,"reviewCount":2,"bestRating":5,"worstRating":1}}',
) => {
return JSDOM.fragment(domString).querySelectorAll(
"script[type='application/ld+json']",
);
};

const belliniMetadataObject = {
'@context': 'https://schema.org',
'@id': 'https://www.bbcgoodfood.com/recipes/bellini#Recipe',
'@type': 'Recipe',
description:
'A classy cocktail served in an elegant flute - this simple combination of peach purée and Prosecco makes a great start to any celebration',
image: {
'@type': 'ImageObject',
height: 400,
url: 'https://images.immediate.co.uk/production/volatile/sites/30/2020/08/bellini-b049342.jpg',
width: 440,
},
mainEntityOfPage: {
'@type': 'WebPage',
'@id': 'https://www.bbcgoodfood.com/recipes/bellini',
},
name: 'Bellini',
url: 'https://www.bbcgoodfood.com/recipes/bellini',
author: { '@type': 'Person', name: 'Good Food team' },
dateModified: '2020-08-08T02:26:26+00:00',
datePublished: '2013-11-18T16:37:44+00:00',
headline: 'Bellini',
keywords:
'Christmas, Christmas morning, cocktails canapes, Good Food, Party, sparkling cocktail',
publisher: {
'@type': 'Organization',
name: 'BBC Good Food',
url: 'https://www.bbcgoodfood.com',
logo: {
'@type': 'ImageObject',
url: 'https://images.immediate.co.uk/production/volatile/sites/30/2019/07/GoodFood-dark-516d417.png',
width: 221,
height: 58,
},
},
nutrition: {
'@type': 'NutritionInformation',
calories: '143 calories',
carbohydrateContent: '18 grams carbohydrates',
sugarContent: '18 grams sugar',
fiberContent: '0.7 grams fiber',
proteinContent: '0.7 grams protein',
},
prepTime: 'PT5M',
recipeCategory: 'Cocktails',
recipeIngredient: [
'500ml peach purée or peach nectar',
'1 bottle prosecco',
],
recipeInstructions: [
{
'@type': 'HowToStep',
text: 'Put the peach puree in a Champagne flute up to about 1/3 full and slowly top up with Prosecco.',
},
],
recipeYield: 6,
totalTime: 'PT5M',
};

const mockRecipe = (
url = 'https://www.bbcgoodfood.com/recipes/bellini',
name = 'Bellini',
imageUrl = 'https://images.immediate.co.uk/production/volatile/sites/30/2020/08/bellini-b049342.jpg',
ingredients = ['500ml peach purée or peach nectar', '1 bottle prosecco'],
steps = [
'Put the peach puree in a Champagne flute up to about 1/3 full and slowly top up with Prosecco.',
],
): Recipe => ({
url,
name,
imageUrl,
ingredients,
steps,
});

beforeEach(async () => {
const module: TestingModule = await Test.createTestingModule({
providers: [RecipeScraper],
Expand Down Expand Up @@ -118,6 +45,18 @@ describe('RecipeScraper', () => {
);
});

it('should extract the Recipe object from a NodeList when the metadata is an array of objects', function () {
expect(
scraper.getSchemaRecipeFromNodeList(mockNodeList(allRecipesDomString)),
).toEqual(allRecipesMetadataObject);
});

it('should extract a Recipe object from a NodeList where the metadata is a graph', function () {
expect(
scraper.getSchemaRecipeFromNodeList(mockNodeList(yoastDomString)),
).toEqual(yoastMetadataObject);
});

it('should correctly parse a recipe name', function () {
expect(scraper.parseRecipeName(belliniMetadataObject.name)).toEqual(
'Bellini',
Expand Down Expand Up @@ -149,14 +88,32 @@ describe('RecipeScraper', () => {
]);
});

it('should be able to return a recipe object from a url', async function () {
it('should be able to return a recipe object from a url where the page metadata contains a single recipe object', async function () {
jest
.spyOn(scraper, 'getNodeListOfMetadataNodesFromUrl')
.mockResolvedValueOnce(mockNodeList());

expect(await scraper.hydrateRecipe(belliniUrl)).toEqual(mockRecipe());
});

it('should be able to return a recipe object from a url where the page metadata contains an array of objects and one is a Recipe', async function () {
jest
.spyOn(scraper, 'getNodeListOfMetadataNodesFromUrl')
.mockResolvedValueOnce(mockNodeList(allRecipesDomString));

expect(await scraper.hydrateRecipe(allRecipesUrl)).toEqual(
allRecipesRecipe,
);
});

it('should be able to return a recipe object from a url where the page metadata contains a graph containing a recipe object', async function () {
jest
.spyOn(scraper, 'getNodeListOfMetadataNodesFromUrl')
.mockResolvedValueOnce(mockNodeList(yoastDomString));

expect(await scraper.hydrateRecipe(yoastUrl)).toEqual(yoastRecipe);
});

afterEach(() => {
jest.clearAllMocks();
});
Expand Down
27 changes: 19 additions & 8 deletions api/src/recipes/scraper/recipeScraper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,25 +51,36 @@ export class RecipeScraper {

if (Array.isArray(parsedNodeContent)) {
for (const metadataObject of parsedNodeContent) {
if (
metadataObject.hasOwnProperty('@type') &&
/recipe/i.test(metadataObject['@type'])
) {
if (this.jsonObjectIsRecipe(metadataObject)) {
return metadataObject;
}
}
} else {
if (
parsedNodeContent.hasOwnProperty('@type') &&
/recipe/i.test(parsedNodeContent['@type'])
) {
if (this.jsonObjectIsRecipe(parsedNodeContent)) {
return parsedNodeContent;
}
if (this.jsonObjectHasGraph(parsedNodeContent)) {
for (const graphNode of parsedNodeContent['@graph']) {
if (this.jsonObjectIsRecipe(graphNode)) {
return graphNode;
}
}
}
}
}
throw new Error('Unable to extract Recipe metadata from provided url');
}

/**
 * Reports whether a parsed JSON-LD node declares a Recipe `@type`.
 *
 * The node counts as a recipe when it has its own `@type` property whose
 * string form contains "recipe" (case-insensitive).
 *
 * @param jsonObject - A value taken from parsed JSON-LD metadata. Values that
 *   are `null` or not objects are tolerated and yield `false`.
 * @returns `true` if the node's `@type` mentions "recipe", otherwise `false`.
 */
jsonObjectIsRecipe(jsonObject: object): boolean {
  // Parsed JSON can contain null or primitive entries; calling a method on
  // those would throw, so reject them up front.
  if (jsonObject === null || typeof jsonObject !== 'object') {
    return false;
  }

  // Go through Object.prototype so a JSON key named "hasOwnProperty" on the
  // node cannot shadow the builtin and break the check.
  if (!Object.prototype.hasOwnProperty.call(jsonObject, '@type')) {
    return false;
  }

  const declaredType: unknown = (jsonObject as Record<string, unknown>)[
    '@type'
  ];

  // String() makes the coercion explicit: an array-valued @type becomes a
  // comma-joined string, so a node typed ['Article', 'Recipe'] still matches.
  return /recipe/i.test(String(declaredType));
}

/**
 * Reports whether a parsed JSON-LD node has its own `@graph` property.
 *
 * NOTE(review): only the presence of the key is checked, not that its value
 * is iterable — callers that loop over `@graph` may want to confirm it is an
 * array first.
 *
 * @param jsonObject - A value taken from parsed JSON-LD metadata. Values that
 *   are `null` or not objects are tolerated and yield `false`.
 * @returns `true` if the node owns a `@graph` key, otherwise `false`.
 */
jsonObjectHasGraph(jsonObject: object): boolean {
  // Parsed JSON can contain null or primitive entries; calling a method on
  // those would throw, so reject them up front.
  if (jsonObject === null || typeof jsonObject !== 'object') {
    return false;
  }

  // Go through Object.prototype so a JSON key named "hasOwnProperty" on the
  // node cannot shadow the builtin and break the check.
  return Object.prototype.hasOwnProperty.call(jsonObject, '@graph');
}

async hydrateRecipe(url: string) {
try {
const nodeList: NodeList = await this.getNodeListOfMetadataNodesFromUrl(
Expand Down
Loading

0 comments on commit b8658c6

Please sign in to comment.