instructions from recipe-scrapers is a list, edit return types (#8)
micahcochran authored Aug 13, 2021
1 parent b6e4818 commit 0d0db21
Showing 2 changed files with 20 additions and 21 deletions.
README.md: 2 additions & 0 deletions
@@ -48,6 +48,7 @@ OR Install the python libraries individually:
 * [Pendulum](https://pendulum.eustace.io/)
 * [PyYAML](https://pyyaml.org/)
 * [requests](https://docs.python-requests.org/)
+* [reppy](https://github.com/seomoz/reppy)
 * [recipe-scrapers](https://github.com/hhursev/recipe-scrapers)
 * [scrape-schema-recipe](https://github.com/micahcochran/scrape-schema-recipe)

@@ -61,6 +62,7 @@ See the [LICENCE](LICENCE) file for terms.

 | Version | Number of Recipes | Minutes:Seconds 🠗 | # Webpages DLed | Derived ----> | webpages DLed/recipe 🠗 | seconds/recipe 🠗 |
 | :------ | :---------------: | :---------------: | :-------------: | ------------- | :--------------------: | :-------------: |
+| 0.2.1 | 20 | 1:55 | 61 | | 3 | 5.8 |
 | 0.2.0-pre | 20 | 1:28 | 51* | | 2.6 | 4.4 |
 | 0.1.0 | 20 | 1:16 | 39 | | 2 | 3.8 |
 | 0.0.2 | 20 | 4:00 | 79 | | 4 | 12 |
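The two derived columns are plain ratios of the earlier ones; for the new 0.2.1 row, 61 webpages / 20 recipes ≈ 3 webpages per recipe, and 1:55 = 115 seconds / 20 recipes ≈ 5.8 seconds per recipe.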
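reppy, newly listed in the README above, is the robots.txt parser the crawler consults before fetching a page. Below is a minimal sketch of such a check, assuming reppy's documented Robots.fetch entry point; the example.com URL is illustrative, not from this commit, while "RecipeCrawlerPY" is the USER_AGENT_ROBOTS string the diff below defines. (The crawler itself imports the module as `import reppy`.)

from reppy.robots import Robots

# Fetch and parse the site's robots.txt (illustrative URL, not from this commit).
robots = Robots.fetch("https://example.com/robots.txt")

# Ask whether this crawler's robots user-agent may fetch a given URL.
if robots.allowed("https://example.com/recipes/", "RecipeCrawlerPY"):
    print("robots.txt permits fetching this URL")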
recipe_crawler.py: 18 additions & 21 deletions
@@ -68,12 +68,10 @@
 # * Make the CLI have better options that are more POSIX-like.

 # ----- Python native imports -----
-import copy
 from datetime import datetime, timedelta
 from io import StringIO
 from itertools import cycle
 import json
-import logging
 import platform
 from random import randint
 import os.path
@@ -83,7 +81,7 @@
 # Use this typing syntax.
 # Python 3.9 allows builtins (dict, list) to be used
 # In Python 3.7 and 3.8, use `from __future__ import annotations`
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Union

 # import urllib.robotparser
 import urllib.parse
@@ -100,15 +98,14 @@
 # from reppy.robots import Robots
 import reppy
 import requests
-from requests.models import Response
 import scrape_schema_recipe
 import yaml

 # Flag slows down the execution of the program so that there is enough time
 # to react if it isn't performing correctly.
 SLOW_DEBUG = True

-__version__ = "0.2.0-pre"
+__version__ = "0.2.1"
 # This is the user-agent
 USER_AGENT_ROBOTS = "RecipeCrawlerPY"
 USER_AGENT = f"RecipeCrawlerPY/{__version__}"
@@ -143,7 +140,7 @@ def add_crawler(
         license=None,
         start_url: str = "",
         site_title: str = "",
-    ):
+    ) -> None:
         if "myplate.gov" in url or "healthyeating.nhlbi.nih.gov" in url:
             self.crawlers.append(
                 RecipeScrapersCrawler(url, recipe_url, license, start_url, site_title)
@@ -167,7 +164,7 @@ def remove_crawler(self, base_url) -> bool:

         return False

-    def run(self):
+    def run(self) -> None:
         """run crawlers sequentially until enough recipes are collected"""
         # loop until the recipe limit is reached or there are no more crawlers left
         while self.num_recipes < self.recipe_limit or len(self.crawlers) < 1:
@@ -239,7 +236,7 @@ def __init__(

         # self._url_list_high and self._url_list_low are the crawler frontier
         # start_url and url are seeds
-        # list of urls that have a better change to have recipes
+        # list of urls that have a better chance to have recipes
         self._url_list_high: List = []
         if start_url:
             logger.debug(f"Add start_url: {start_url}")
@@ -341,7 +338,7 @@ def crawl(self) -> int:
         return False
         """

-    def _has_similar_recipe(self, recipe: Dict) -> bool:
+    def _has_similar_recipe(self, recipe: Dict) -> int:
         """Test if there is already a similar recipe
         return the index number in self.recipe_json of a similar
@@ -366,7 +363,7 @@ def _has_similar_recipe(self, recipe: Dict) -> bool:
logger.debug("No similar recipe.")
return -1

def _download_page(self):
def _download_page(self) -> requests.models.Response:
"""
Get an url from the list and download a webpage
@@ -403,7 +400,7 @@ def _download_page(self):
     def _scrape_page(self, response: requests.Response):
         """
         scrapes a page
-        input: response is a requests.Response object from requests.get()
+        input: response is a requests.models.Response object from requests.get()
         return dict or List, if it is empty it will be None
"""
@@ -424,7 +421,7 @@ def _scrape_page(self, response: requests.Response):

         return None

-    def _mine_anchors(self, response: requests.Response):
+    def _mine_anchors(self, response: requests.models.Response) -> None:
         """Mines anchors from the webpage response.
         This takes all of the anchors, evaluates them, and places them into the proper lists."""
@@ -582,10 +579,10 @@ def __init__(

         super().__init__(url, recipe_url, license, start_url, site_title)

-    def _scrape_page(self, response: requests.Response):
+    def _scrape_page(self, response: requests.models.Response) -> Dict:
         """
         scrapes a page
-        input: response is a requests.Response object from requests.get()
+        input: response is a requests.models.Response object from requests.get()
         return dict or List, if it is empty it will be None
"""
@@ -600,7 +597,7 @@ def _scrape_page(self, response: requests.Response):
         return dict

     # TODO: This function will need to be rewritten based on how exceptions work in recipe-scrapers versions after 13.3.0
-    def _convert_recipe_scraper_to_schema_dict(self, rs, url: str) -> Dict:
+    def _convert_recipe_scraper_to_schema_dict(self, rs, url: str) -> Union[Dict, None]:
         """Convert recipe-scraper object into a recipe schema dictionary"""
         logger.debug("called _convert_recipe_scraper_to_schema_dict()")
         d = {}
@@ -615,8 +612,8 @@ def _convert_recipe_scraper_to_schema_dict(self, rs, url: str) -> Dict:

d["name"] = rs.title()

# this is the text version of the tag
d["recipeInstructions"] = rs.instructions()
# this is the list version of the tag
d["recipeInstructions"] = rs.instructions().split("\n")

try:
if rs.total_time():
@@ -685,7 +682,7 @@ def __init__(self, message):
         self.message = message
         super().__init__(message)

-    def __str__(self):
+    def __str__(self) -> str:
         return f"recipe_crawler exception: {self.message}"


@@ -696,7 +693,7 @@ def __init__(self, message):
         self.message = message
         super().__init__(message)

-    def __str__(self):
+    def __str__(self) -> str:
         return f"AnchorListsEmptyError: {self.message}"


@@ -708,7 +705,7 @@ def __init__(self, other_err):
         super().__init__(message)


-def usage():
+def usage() -> str:
     prompt = "$"
     if platform.system() == "Windows":
         prompt = "C:\..>"
@@ -797,7 +794,7 @@ def arg_in_list(arg):
logger.info(f"Number of web pages downloaded: {mc.results_num_get_calls()}")

logger.info(
f"Number of HTML bytes downloaded: {mc.results_num_bytes_html_downloaded()/2**20} MiB"
f"Number of HTML bytes downloaded: {mc.results_num_bytes_html_downloaded()/2**20:.3f} MiB"
)

runtime_str = pendulum.now().diff(start_time).in_words()
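The substantive fix in this commit is the `rs.instructions().split("\n")` hunk: recipe-scrapers hands back a recipe's instructions as one newline-joined string, while the schema.org Recipe format the crawler stores expects recipeInstructions to be a list of steps. A minimal sketch of that conversion, assuming recipe-scrapers' documented scrape_me entry point (the URL is illustrative, not from this commit):

from recipe_scrapers import scrape_me

# Illustrative URL; any site supported by recipe-scrapers would do.
rs = scrape_me("https://www.myplate.gov/recipes/example")

# instructions() returns a single string with the steps separated by
# newlines, e.g. "Preheat oven.\nMix ingredients.\nBake."
text = rs.instructions()

# Splitting on "\n" yields the list form that schema.org expects:
# ["Preheat oven.", "Mix ingredients.", "Bake."]
steps = text.split("\n")

The return-type edits follow the same reasoning: _convert_recipe_scraper_to_schema_dict can return None when a page yields no usable recipe, so its annotation widens from Dict to Union[Dict, None] (the typing spelling rather than Dict | None, which would require Python 3.10 or the `from __future__ import annotations` workaround the file's own comments mention).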
