From 72861a04a956af5aa421f28457a46223589068cc Mon Sep 17 00:00:00 2001 From: "Micah D. Cochran" Date: Mon, 25 Sep 2023 11:19:38 -0500 Subject: [PATCH 1/2] rm reppy, improve typing, sup. Crawl-delay, more --- .gitignore | 1 + README.md | 6 +- recipe_crawler.py | 141 +++++++++++++++++++++----------- requirements-dev.txt | 4 + website_source-open_source.yaml | 9 +- website_sources.yaml | 7 ++ 6 files changed, 117 insertions(+), 51 deletions(-) create mode 100644 requirements-dev.txt diff --git a/.gitignore b/.gitignore index 6e48a25..f445ea5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ __pycache__/ donotinclude/ +.venv/ *.json *.log licenses*.md \ No newline at end of file diff --git a/README.md b/README.md index c91b003..e6cb88c 100644 --- a/README.md +++ b/README.md @@ -81,10 +81,14 @@ OR Install the python libraries individually: * [Pendulum](https://pendulum.eustace.io/) * [PyYAML](https://pyyaml.org/) * [requests](https://docs.python-requests.org/) -* [reppy](https://github.com/seomoz/reppy) * [recipe-scrapers](https://github.com/hhursev/recipe-scrapers) * [scrape-schema-recipe](https://github.com/micahcochran/scrape-schema-recipe) + +For development, install the typing-related development libraries: +``` +> pip install -r requirements-dev.txt +``` ### Optional Dependency If you want to download pages using [Brotli compression](https://en.wikipedia.org/wiki/Brotli) * ensure requests >= 2.26.0 is installed, in order to check your request version (two ways) diff --git a/recipe_crawler.py b/recipe_crawler.py index 4d0252f..4fc7c11 100755 --- a/recipe_crawler.py +++ b/recipe_crawler.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# Copyright 2021 Micah Cochran +# Copyright 2021-2023 Micah Cochran # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -49,7 +49,7 @@ # * Sitemap support. This might help some of the crawlers that exhaust all their links. 
# # * Some websites almost instantly exit out. I'm not too sure why, but I'm sure other web crawlers have encountered this. -# Some of it may be due to using non-standard forms for URLs. Prehaps adding a sitemap mode might be a way to work around this. +# Some of it may be due to using non-standard forms for URLs. Perhaps adding a sitemap mode might be a way to work around this. # # * There could be more logic to detecting if recipes are the same by the URL, but this would require some pattern matching code in the URL. # Logging the matching recipes would aid in figuring that out. @@ -75,19 +75,19 @@ # ----- Python native imports ----- import argparse from datetime import datetime, timedelta -from io import StringIO from itertools import cycle import json from random import randint import os.path import sys -from time import sleep +import time import urllib.parse +import urllib.robotparser # Use this typing syntax. # Python 3.9 allows builtins (dict, list) to be used # In Python 3.7 and 3.8, use `from __future__ import annotations` -from typing import Dict, List, Tuple, Union +from typing import Any, Dict, List, Tuple, Union # ----- external imports ----- @@ -97,8 +97,8 @@ import pendulum import recipe_scrapers from recipe_scrapers.settings import settings as recipe_scrapers_settings +from recipe_scrapers._abstract import AbstractScraper from recipe_scrapers._exceptions import ElementNotFoundInHtml -import reppy import requests import scrape_schema_recipe import yaml @@ -107,18 +107,13 @@ # isn't performing correctly, when set to True. False turns this off. 
SLOW_DEBUG = True -__version__ = "0.3.0" +__version__ = "0.4.0" # This is the user-agent USER_AGENT_ROBOTS = "RecipeCrawlerPY" USER_AGENT = f"RecipeCrawlerPY/{__version__}" REQUESTS_HEADERS = {"user-agent": USER_AGENT} -# put it in test mode for RecipeScrapersCrawler -if recipe_scrapers_settings.TEST_MODE is not False: - raise RuntimeError("TEST_MODE should be False.") -recipe_scrapers_settings.TEST_MODE = True - ###### class MultiCrawler ############################################## class MultiCrawler: @@ -139,7 +134,7 @@ def add_crawler( self, url: str, recipe_url: str = "", - license=None, + license: Union[str, None] = None, start_url: str = "", site_title: str = "", ) -> None: @@ -171,8 +166,19 @@ def run(self) -> None: # loop until the recipe limit is reached or there are no more crawlers left while self.num_recipes < self.recipe_limit or len(self.crawlers) < 1: crawler = next(self.crawler_iter) + + # if True, delay crawling this URL due to Crawl-delay in robots.txt + if crawler.delay_crawl(): + if SLOW_DEBUG is True: + time.sleep(1) + continue + try: - self.num_recipes += crawler.crawl() + try: + self.num_recipes += crawler.crawl() + except requests.exceptions.ReadTimeout as e: + logger.info(f"Caught ReadTimeout exception: {e}") + continue except AnchorListsEmptyError as e: logger.info(f"E is: {e}") logger.info(f"Terminating crawler for {crawler.base_url}") @@ -182,8 +188,9 @@ def run(self) -> None: if SLOW_DEBUG is True: logger.debug(f"self.num_recipes = {self.num_recipes}") # Sleeps for 1 second, allow for the user to watch for the crawler messing up. - # This somewhat "throttles" how much bandwidth. Would need more code to actually implement throttling. - sleep(1) + # This somewhat "throttles" how much bandwidth is needed. + # Would need more code to actually implement throttling. 
+ time.sleep(1) def results_dict(self) -> List[Dict]: """resulting singular list of dictionaries that represent schema-recipes""" @@ -280,12 +287,24 @@ def __init__( robots_txt_url = urllib.parse.urljoin(url, "robots.txt") - try: - robots = reppy.robots.Robots.fetch(robots_txt_url) - except reppy.exceptions.ConnectionExceptions as e: - raise RecipeCrawlerNetworkError(e) - self.agent = robots.agent("*") - logger.debug(f"Reading robots.txt at: {robots_txt_url}") + self.robots_txt = urllib.robotparser.RobotFileParser(robots_txt_url) + self.robots_txt.read() + + self.crawl_delay = False + + if self.robots_txt.crawl_delay(USER_AGENT_ROBOTS) is not None: + self.crawl_delay = True + self.crawl_delay_seconds: int = int( + self.robots_txt.crawl_delay(USER_AGENT_ROBOTS) + ) + self.last_crawl = time.time() + + if self.robots_txt.request_rate(USER_AGENT_ROBOTS) is not None: + raise NotImplementedError( + "Software does not support Request-rate in robots.txt file." + ) + + logger.debug(f"Read robots.txt at: {robots_txt_url}") self.urltest = URLTest(url) @@ -316,7 +335,7 @@ def crawl(self) -> int: logger.debug(f"URL 1: {scrapings['url']}") logger.debug(f"URL 2: {self.recipe_json[similar_idx]['url']}") else: # for lists - raise NotImplemented( + raise NotImplementedError( "Recipes websites currently don't have multiple recipes on one webpage, not sure of the implications." ) num_recipes += len(scrapings) @@ -357,6 +376,7 @@ def _download_page(self) -> requests.models.Response: returns a requests.Response object """ # logger.debug("called _download_page()") + # TODO Why is this code being ignored? 
url = None while not url: # this picks a url @@ -378,6 +398,10 @@ def _download_page(self) -> requests.models.Response: self.been_there_urls.append(url) logger.debug(f"Visiting {url}") + # set last_crawl time to now + if self.crawl_delay: + self.last_crawl = time.time() + self.num_get_calls += 1 resp = requests.get(url, headers=REQUESTS_HEADERS, timeout=5) if resp.headers.get("Content-Length"): @@ -471,9 +495,7 @@ def _rank_url(self, url: str) -> Tuple[int, str]: return (-4, url) # Check if robots.txt rules allow going to this URL - # if self.robotparse.can_fetch(USER_AGENT_ROBOTS, url) is False: - # if self.robotparse.can_fetch("*", url) is False: - if self.agent.allowed(url) is False: + if self.robots_txt.can_fetch(USER_AGENT_ROBOTS, url) is False: return (-5, url) if self._recipe_url != "" and url.startswith(self._recipe_url): @@ -538,6 +560,14 @@ def license_report(self) -> str: return md + def delay_crawl(self) -> bool: + """Should this crawl be delayed due to a time delay?""" + + if self.crawl_delay is False: + return False + + return (time.time() - self.last_crawl) < self.crawl_delay_seconds + def load_website_sources_list( config_filename: str = "website_sources.yaml", @@ -573,11 +603,13 @@ def __init__( elif "healthyeating.nhlbi.nih.gov" in url: self.ScraperDriver = recipe_scrapers.NIHHealthyEating else: - raise NotImplemented(f"Website '{url}' has not been implemented.") + raise NotImplementedError(f"Website '{url}' has not been implemented.") super().__init__(url, recipe_url, license, start_url, site_title) - def _scrape_page(self, response: requests.models.Response) -> Dict: + def _scrape_page( + self, response: requests.models.Response + ) -> Union[Dict[str, Any], None]: """ scrapes a page input: response is a requests.models.Response object from requests.get() @@ -585,18 +617,23 @@ def _scrape_page(self, response: requests.models.Response) -> Dict: return dict or List, if it is empty it will be None """ logger.debug("called 
RecipeScrapersCrawler._scrape_page()") - with StringIO(response.text) as fp: - # NOTE does recipe_scraper_obj need to be kept? - recipe_scraper_obj = self.ScraperDriver(fp) - dict = self._convert_recipe_scraper_to_schema_dict( - recipe_scraper_obj, response.url - ) - return dict + # NOTE does recipe_scraper_obj need to be kept? + recipe_scraper_obj = self.ScraperDriver(url=response.url, html=response.text) + + recipe_dict = self._convert_recipe_scraper_to_schema_dict( + recipe_scraper_obj, response.url + ) + return recipe_dict # TODO: This function will need to be rewritten based on how exceptions work in recipe-scrapers versions after 13.3.0 - def _convert_recipe_scraper_to_schema_dict(self, rs, url: str) -> Union[Dict, None]: + def _convert_recipe_scraper_to_schema_dict( + self, + rs: AbstractScraper, + url: str, + ) -> Union[Dict[str, Any], None]: """Convert recipe-scraper object into a recipe schema dictionary""" + logger.debug("called _convert_recipe_scraper_to_schema_dict()") d = {} @@ -620,7 +657,7 @@ def _convert_recipe_scraper_to_schema_dict(self, rs, url: str) -> Union[Dict, No d["totalTime"] = isodate.duration_isoformat( timedelta(minutes=rs.total_time()) ) - except (NotImplemented, ElementNotFoundInHtml): + except (NotImplementedError, ElementNotFoundInHtml): pass try: @@ -651,7 +688,7 @@ def _convert_recipe_scraper_to_schema_dict(self, rs, url: str) -> Union[Dict, No class URLTest: """Class of tests for URLs""" - def __init__(self, baseurl: str = None): + def __init__(self, baseurl: Union[str, None] = None): """baseurl is the start URL of the website""" self.baseurl = baseurl self.basesplit = None @@ -735,6 +772,7 @@ def __str__(self) -> str: def main(sys_args: List = sys.argv[1:]) -> None: website_sources_list: List = [] + # handle command line arguments parser = argparse.ArgumentParser( description="Recipe Crawler to that saves a cookbook to a JSON file." 
) @@ -809,15 +847,20 @@ def arg_in_list(a): for source in website_sources_list: # logger.debug('KEYS: {}'.format(source['site'].keys())) - logger.debug(f"Adding crawler for: {source['site']['url']}") - mc.add_crawler( - source["site"]["url"], - source["site"].get("recipe_url"), - source["site"].get("license"), - # this is a URL the one that most-likely to get to recipes quickly, such as an index or landing pages for recipes. - source["site"].get("start_url"), - source["site"].get("title"), - ) + try: + logger.debug(f"Adding crawler for: {source['site']['url']}") + mc.add_crawler( + source["site"]["url"], + source["site"].get("recipe_url"), + source["site"].get("license"), + # URL the one that most-likely to get to recipes quickly, such as an index or landing pages for recipes. + source["site"].get("start_url"), + source["site"].get("title"), + ) + except NotImplementedError as e: + logger.debug( + f"Skipping crawler : '{source['site']['url']}' due to a NotImplementedError '{e}'" + ) mc.run() @@ -827,7 +870,7 @@ def arg_in_list(a): with open(args.output, "w") as fp: json.dump(recipes_dict, fp) - license_filename = f"license-{args.output[:-5]}.md" + license_filename = f"licenses-{args.output[:-5]}.md" with open(license_filename, "w") as fp: fp.write(mc.generate_license_report()) @@ -842,7 +885,7 @@ def arg_in_list(a): logger.info(" * Metric is not accurate.") runtime = pendulum.now().diff(start_time) # README.md row printer, INCOMPLETE -# logger.info(f"row: | {__version__} | {args.limit} | | {mc.results_num_get_calls()} | | {mc.results_num_get_calls() / args.limit} | |") + # logger.info(f"row: | {__version__} | {args.limit} | | {mc.results_num_get_calls()} | | {mc.results_num_get_calls() / args.limit} | |") logger.info(f"Program's Runtime: {runtime.in_words()}") diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..2b56e4c --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,4 @@ +# Development related requirements including 
types +types-beautifulsoup4 +types-PyYAML +types-requests diff --git a/website_source-open_source.yaml b/website_source-open_source.yaml index 4175ce2..38ad4b3 100644 --- a/website_source-open_source.yaml +++ b/website_source-open_source.yaml @@ -41,4 +41,11 @@ site: url: https://healthyeating.nhlbi.nih.gov/ title: NIH Healthy Eating recipe_url: https://healthyeating.nhlbi.nih.gov/recipedetail.aspx? - license: https://www.nhlbi.nih.gov/about/contact/trademark-branding-and-logo \ No newline at end of file + license: https://www.nhlbi.nih.gov/about/contact/trademark-branding-and-logo +--- +site: + url: https://www.foodista.com/ + title: Foodista + start_url: https://www.foodista.com/blog/recipes-cooking + recipe_url: https://www.foodista.com/recipe/ + license: http://creativecommons.org/licenses/by/3.0/ diff --git a/website_sources.yaml b/website_sources.yaml index 8543592..2901b38 100644 --- a/website_sources.yaml +++ b/website_sources.yaml @@ -71,3 +71,10 @@ site: start_url: https://medlineplus.gov/recipes/ recipe_url: https://medlineplus.gov/recipes/ license: https://www.nlm.nih.gov/web_policies.html#copyright +--- +site: + url: https://www.foodista.com/ + title: Foodista + start_url: https://www.foodista.com/blog/recipes-cooking + recipe_url: https://www.foodista.com/recipe/ + license: http://creativecommons.org/licenses/by/3.0/ From f07ecef59ed6c05aacf61f996019b201566c8b33 Mon Sep 17 00:00:00 2001 From: "Micah D. Cochran" Date: Tue, 26 Sep 2023 16:07:43 -0500 Subject: [PATCH 2/2] update dependencies to newer versions --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index ce62fb7..60041a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ beautifulsoup4 loguru -recipe-scrapers>=13.3.3 +recipe-scrapers>=14.48.0 requests -scrape-schema-recipe>=0.1.3 +scrape-schema-recipe>=0.2.2 pendulum PyYAML