-
Notifications
You must be signed in to change notification settings - Fork 1
/
recipeFetcher.py
169 lines (134 loc) · 7.2 KB
/
recipeFetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import re
import string
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
class RecipeFetcher:
    """Search allrecipes.com and scrape a recipe page into plain Python data.

    All methods perform blocking HTTP requests through ``requests`` and parse
    the response with BeautifulSoup's ``html.parser``.
    """

    # URL template for the search page; ``%s`` receives the encoded keywords.
    search_base_url = 'https://www.allrecipes.com/search/results/?wt=%s&sort=re'

    # Seconds to wait for each HTTP request before ``requests`` raises a
    # timeout error; without a timeout a stalled server blocks forever.
    request_timeout = 30

    # todo: move to constants
    units = {}

    # Matches a decimal number ("19.3") or a plain integer ("115").
    _NUMBER_PATTERN = re.compile(r"\d+\.\d+|\d+")

    def _fetch_page(self, url):
        """Download ``url`` and return its parsed BeautifulSoup document."""
        response = requests.get(url, timeout=self.request_timeout)
        return BeautifulSoup(response.content, features="html.parser")

    def search_recipes(self, keywords):
        """Return the recipe links found on the search page for ``keywords``.

        Args:
            keywords: Free-text search terms, e.g. ``'meat lasagna'``.

        Returns:
            A list with the ``href`` of the first anchor inside every
            ``div.grid-card-image-container`` element, in page order. Cards
            without an anchor or without an ``href`` are skipped.
        """
        # quote_plus turns spaces into '+' (as before) and additionally
        # escapes characters such as '&' or '#' that would otherwise break
        # the query string.
        search_url = self.search_base_url % quote_plus(keywords)
        page_graph = self._fetch_page(search_url)
        cards = page_graph.find_all('div', {'class': 'grid-card-image-container'})
        return [card.a['href'] for card in cards
                if card.a is not None and card.a.has_attr('href')]

    def scrape_recipe(self, recipe_url):
        """Scrape ingredients, directions and nutrition facts for one recipe.

        Two page layouts are tried for ingredients and for directions; the
        second is used only when the first yields nothing.

        Args:
            recipe_url: URL of the recipe page.

        Returns:
            A dict with the keys ``'ingredients'`` (list of str),
            ``'directions'`` (list of str) and ``'nutrition'`` (the list
            returned by ``scrape_nutrition_facts``).
        """
        page_graph = self._fetch_page(recipe_url)

        ingredients = [tag.text for tag in
                       page_graph.find_all('span', {'itemprop': 'recipeIngredient'})]
        # different ingredient format, attempt to populate array with new format
        if not ingredients:
            ingredients = [tag.text.strip() for tag in
                           page_graph.find_all('span', {'class': 'ingredients-item-name'})]

        directions = []
        for tag in page_graph.find_all('span', {'class': 'recipe-directions__list--item'}):
            step = tag.text.strip()
            if step:
                directions.append(step)
        # different direction format, attempt to populate array with new format
        if not directions:
            items = page_graph.find_all(
                'li', {'class': 'subcontainer instructions-section-item'})
            for item in items:
                paragraph = item.find('p')
                # Skip items that carry no <p>; dereferencing None would
                # abort the whole scrape.
                if paragraph is not None:
                    directions.append(paragraph.text.strip())

        return {
            'ingredients': ingredients,
            'directions': directions,
            'nutrition': self.scrape_nutrition_facts(recipe_url),
        }

    def scrape_nutrition_facts(self, recipe_url):
        """Scrape the nutrition table that belongs to ``recipe_url``.

        The nutrition page address is built by appending
        ``/fullrecipenutrition`` to ``recipe_url`` unchanged.

        Returns:
            A list of dicts, one per parseable ``div.nutrition-row``, each
            with the keys ``'name'``, ``'amount'``, ``'unit'`` and
            ``'daily value'``. Rows without a ``':'`` separator are skipped.
        """
        nutrition_facts_url = f'{recipe_url}/fullrecipenutrition'
        page_graph = self._fetch_page(nutrition_facts_url)

        results = []
        for nutrient_row in page_graph.find_all('div', {'class': 'nutrition-row'}):
            nutrient = self._parse_nutrition_row(nutrient_row.text)
            if nutrient is not None:
                results.append(nutrient)
        return results

    def _parse_nutrition_row(self, row_text):
        """Turn the text of one nutrition row into a nutrient dict.

        The text is split on ``':'``: the part before the first colon is the
        name, the part between the first and second colon holds the amount
        (first line) and the daily value (second line, if any).

        Returns:
            A dict with stripped string values for ``'name'``, ``'amount'``
            and ``'unit'``; ``'daily value'`` is the stripped second line or
            ``None`` when that line is missing or blank. Returns ``None``
            when ``row_text`` contains no ``':'``.
        """
        fields = row_text.split(':')
        if len(fields) < 2:
            return None

        value_lines = fields[1].split('\n')
        amount_text = value_lines[0]
        daily_value = value_lines[1].strip() if len(value_lines) > 1 else ''

        return {
            'name': fields[0].strip(),
            # extract_numbers may return the int 0, hence the str() call.
            'amount': str(self.extract_numbers(amount_text)).strip(),
            'unit': self.extract_unit(amount_text).strip(),
            'daily value': daily_value or None,
        }

    def extract_numbers(self, text):
        """Return the first number in ``text`` as a string.

        Returns the integer ``0`` (not a string) when ``text`` contains no
        digits.
        """
        match = self._NUMBER_PATTERN.search(text)
        return match.group(0) if match else 0

    def extract_unit(self, text):
        """Return whatever follows the first number in ``text``.

        The text is split on every number; the piece after the first number
        (up to the next number, if any) is returned unstripped. Returns ``''``
        when ``text`` contains no number.
        """
        pieces = self._NUMBER_PATTERN.split(text)
        return pieces[1] if len(pieces) > 1 else ''
if __name__ == '__main__':
    # Demo: look up the first "meat lasagna" search hit and print its
    # ingredients and numbered directions.
    rf = RecipeFetcher()
    search_hits = rf.search_recipes('meat lasagna')
    if not search_hits:
        # Exit with a readable message instead of an IndexError traceback.
        raise SystemExit("No recipes found for 'meat lasagna'")
    meat_lasagna = search_hits[0]
    results = rf.scrape_recipe(meat_lasagna)
    #print(results)

    print("INGREDIENTS:")
    for ingredient in results['ingredients']:
        print(ingredient)
    print("\n")

    print("DIRECTIONS:")
    for count, direction in enumerate(results['directions'], start=1):
        # Drop the "Watch Now" video label and the whitespace it leaves behind.
        direction = direction.replace("Watch Now", "").strip()
        print(f"{count}. {direction}")

    #don't need to print nutritional info
    # print("\n")
    # print("NUTRITION:")
    # for nutrient_row in results['nutrition']:
    #     print(nutrient_row)
"""
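Expected return value of scrape_recipe() for the meat lasagna example above (note: the code emits the key 'daily value', with a space, where this sample shows 'daily_value'):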
{'ingredients': ['12 whole wheat lasagna noodles',
'1 pound lean ground beef',
'2 cloves garlic, chopped',
'1/2 teaspoon garlic powder',
'1 teaspoon dried oregano, or to taste',
'salt and ground black pepper to taste',
'1 (16 ounce) package cottage cheese',
'2 eggs',
'1/2 cup shredded Parmesan cheese',
'1 1/2 (25 ounce) jars tomato-basil pasta sauce',
'2 cups shredded mozzarella cheese'],
'directions': ['Preheat oven to 350 degrees F (175 degrees C).\n Watch Now',
'Fill a large pot with lightly salted water and bring to a rolling boil over high heat. Once the water is boiling, add the lasagna noodles a few at a time, and return to a boil. Cook the pasta uncovered, stirring occasionally, until the pasta has cooked through, but is still firm to the bite, about 10 minutes. Remove the noodles to a plate.\n Watch Now',
'Place the ground beef into a skillet over medium heat, add the garlic, garlic powder, oregano, salt, and black pepper to the skillet. Cook the meat, chopping it into small chunks as it cooks, until no longer pink, about 10 minutes. Drain excess grease.\n Watch Now',
'In a bowl, mix the cottage cheese, eggs, and Parmesan cheese until thoroughly combined.\n Watch Now',
'Place 4 noodles side by side into the bottom of a 9x13-inch baking pan; top with a layer of the tomato-basil sauce, a layer of ground beef mixture, and a layer of the cottage cheese mixture. Repeat layers twice more, ending with a layer of sauce; sprinkle top with the mozzarella cheese. Cover the dish with aluminum foil.\n Watch Now',
'Bake in the preheated oven until the casserole is bubbling and the cheese has melted, about 30 minutes. Remove foil and bake until cheese has begun to brown, about 10 more minutes. Allow to stand at least 10 minutes before serving.\n Watch Now'],
'nutrition': [{'name': 'Total Fat',
'amount': '19.3',
'unit': 'g',
'daily_value': '30 %'},
{'name': 'Saturated Fat', 'amount': '9.0', 'unit': 'g', 'daily_value': None},
{'name': 'Cholesterol',
'amount': '115',
'unit': 'mg',
'daily_value': '38 %'},
{'name': 'Sodium', 'amount': '999', 'unit': 'mg', 'daily_value': '40 %'},
{'name': 'Potassium', 'amount': '717', 'unit': 'mg', 'daily_value': '20 %'},
{'name': 'Total Carbohydrates',
'amount': '47.1',
'unit': 'g',
'daily_value': '15 %'},
{'name': 'Dietary Fiber',
'amount': '6.3',
'unit': 'g',
'daily_value': '25 %'},
{'name': 'Protein', 'amount': '35.6', 'unit': 'g', 'daily_value': '71 %'},
{'name': 'Sugars', 'amount': '12', 'unit': 'g', 'daily_value': None},
{'name': 'Vitamin A', 'amount': '855', 'unit': 'IU', 'daily_value': None},
{'name': 'Vitamin C', 'amount': '2', 'unit': 'mg', 'daily_value': None},
{'name': 'Calcium', 'amount': '361', 'unit': 'mg', 'daily_value': None},
{'name': 'Iron', 'amount': '4', 'unit': 'mg', 'daily_value': None},
{'name': 'Thiamin', 'amount': '0', 'unit': 'mg', 'daily_value': None},
{'name': 'Niacin', 'amount': '11', 'unit': 'mg', 'daily_value': None},
{'name': 'Vitamin B6', 'amount': '0', 'unit': 'mg', 'daily_value': None},
{'name': 'Magnesium', 'amount': '74', 'unit': 'mg', 'daily_value': None},
{'name': 'Folate', 'amount': '41', 'unit': 'mcg', 'daily_value': None}]}
"""