diff --git a/scrapy_proj/openrecipes/spiders/cookstr_spider.py b/scrapy_proj/openrecipes/spiders/cookstr_spider.py new file mode 100644 index 0000000..6a935bd --- /dev/null +++ b/scrapy_proj/openrecipes/spiders/cookstr_spider.py @@ -0,0 +1,73 @@ +from scrapy.contrib.spiders import CrawlSpider, Rule +from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor +from scrapy.selector import HtmlXPathSelector +from openrecipes.items import RecipeItem, RecipeItemLoader + + +class CookstrMixin(object): + source = 'cookstr' + + def parse_item(self, response): + + hxs = HtmlXPathSelector(response) + + base_path = '//*[@itemtype="http://schema.org/Recipe"]' + + recipes_scopes = hxs.select(base_path) + + name_path = '//*[@class="recipe-title"]/text()' + description_path = '//span[@class="recipe_structure_headnotes"]/p/text()' + #formatting odd for image, so must concatenate url to beginning of image path + image_path = 'concat("http://www.cookstr.com", //*[@itemprop="image"]/@src)' + prepTime_path = 'id("recipe_body")/div[4]/span/text()' + #for some formatting, the info won't display. 
+ cookTime_path = 'id("recipe_body")/div[5]/span/text()' + recipeYield_path = '//*[@itemprop="recipeYield"]/text()' + ingredients_path = '//*[@itemprop="ingredients"]//text()' + + recipes = [] + + for r_scope in recipes_scopes: + il = RecipeItemLoader(item=RecipeItem()) + + il.add_value('source', self.source) + + il.add_value('name', r_scope.select(name_path).extract()) + il.add_value('image', r_scope.select(image_path).extract()) + il.add_value('url', response.url) + il.add_value('description', r_scope.select(description_path).extract()) + + il.add_value('prepTime', r_scope.select(prepTime_path).extract()) + il.add_value('cookTime', r_scope.select(cookTime_path).extract()) + il.add_value('recipeYield', r_scope.select(recipeYield_path).extract()) + + ingredients = [] + ingredient_scopes = r_scope.select(ingredients_path) + for ingredient_scope in ingredient_scopes: + ingredient = ingredient_scope.extract().strip() + if (ingredient): + ingredients.append(ingredient) + il.add_value('ingredients', ingredients) + + recipes.append(il.load_item()) + + return recipes + + +class CookstrcrawlSpider(CrawlSpider, CookstrMixin): + + name = "www.cookstr.com" + + allowed_domains = ["www.cookstr.com"] + + start_urls = [ + #resulting url when hitting enter into search with nothing searching + "http://www.cookstr.com/searches?," + ] + + rules = ( + Rule(SgmlLinkExtractor(allow=('/searches\?page=\d+'))), + + Rule(SgmlLinkExtractor(allow=('/recipes/[a-z-]+')), + callback='parse_item'), + ) diff --git a/scrapy_proj/openrecipes/spiders/dashingdish_feedspider.py b/scrapy_proj/openrecipes/spiders/dashingdish_feedspider.py new file mode 100644 index 0000000..333806e --- /dev/null +++ b/scrapy_proj/openrecipes/spiders/dashingdish_feedspider.py @@ -0,0 +1,24 @@ +from scrapy.spider import BaseSpider +from scrapy.http import Request +from scrapy.selector import XmlXPathSelector +from openrecipes.spiders.dashingdish_spider import Dashingdish_spiderMixin + + +class 
DashingdishfeedSpider(BaseSpider, Dashingdish_spiderMixin): + + name = "dashingdish.feed" + allowed_domains = [ + "dashingdish.com", + "feeds.feedburner.com", + "feedproxy.google.com" + ] + start_urls = [ + "http://feeds.feedburner.com/dashingdish-recipes", + ] + + def parse(self, response): + + xxs = XmlXPathSelector(response) + links = xxs.select("//item/*[local-name()='origLink']/text()").extract() + + return [Request(x, callback=self.parse_item) for x in links] diff --git a/scrapy_proj/openrecipes/spiders/davidlebovitz_feedspider.py b/scrapy_proj/openrecipes/spiders/davidlebovitz_feedspider.py new file mode 100644 index 0000000..b6dd3c6 --- /dev/null +++ b/scrapy_proj/openrecipes/spiders/davidlebovitz_feedspider.py @@ -0,0 +1,24 @@ +from scrapy.spider import BaseSpider +from scrapy.http import Request +from scrapy.selector import XmlXPathSelector +from openrecipes.spiders.davidlebovitz_spider import DavidlebovitzMixin + + +class DavidlebovitzfeedSpider(BaseSpider, DavidlebovitzMixin): + + name = "davidlebovitz.feed" + allowed_domains = [ + "davidlebovitz.com", + "feeds.feedburner.com", + "feedproxy.google.com" + ] + start_urls = [ + "http://feeds.feedburner.com/davidlebovitz/blog", + ] + + def parse(self, response): + + xxs = XmlXPathSelector(response) + links = xxs.select("//item/*[local-name()='origLink']/text()").extract() + + return [Request(x, callback=self.parse_item) for x in links] diff --git a/scrapy_proj/openrecipes/spiders/fortheloveofcooking_feedspider.py b/scrapy_proj/openrecipes/spiders/fortheloveofcooking_feedspider.py new file mode 100644 index 0000000..c6127b6 --- /dev/null +++ b/scrapy_proj/openrecipes/spiders/fortheloveofcooking_feedspider.py @@ -0,0 +1,24 @@ +from scrapy.spider import BaseSpider +from scrapy.http import Request +from scrapy.selector import XmlXPathSelector +from openrecipes.spiders.fortheloveofcooking_spider import FortheloveofcookingMixin + + +class FortheloveofcookingfeedSpider(BaseSpider, FortheloveofcookingMixin): + + 
name = "fortheloveofcooking.feed" + allowed_domains = [ + "fortheloveofcooking.net", + "feeds.feedburner.com", + "feedproxy.google.com" + ] + start_urls = [ + "http://feeds.feedburner.com/blogspot/OlvyH", + ] + + def parse(self, response): + + xxs = XmlXPathSelector(response) + links = xxs.select("//item/*[local-name()='origLink']/text()").extract() + + return [Request(x, callback=self.parse_item) for x in links] diff --git a/scrapy_proj/openrecipes/spiders/marthastewart_spider.py b/scrapy_proj/openrecipes/spiders/marthastewart_spider.py new file mode 100644 index 0000000..a11a07d --- /dev/null +++ b/scrapy_proj/openrecipes/spiders/marthastewart_spider.py @@ -0,0 +1,77 @@ +from scrapy.contrib.spiders import CrawlSpider, Rule +from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor +from scrapy.selector import HtmlXPathSelector +from openrecipes.items import RecipeItem, RecipeItemLoader + + +class MarthastewartMixin(object): + source = 'marthastewart' + + def parse_item(self, response): + + hxs = HtmlXPathSelector(response) + + base_path = '//*[@class="hrecipe"]' + + recipes_scopes = hxs.select(base_path) + + name_path = '//*[@class="title fn"]/text()' + description_path = '//*[@class="expand-body"]/text()' + #formatting odd, so must concatenate base url with image + image_path = 'concat("http://www.marthastewart.com", //*[@class="img-l photo"]/@src)' + prepTime_path = '//li[1]/span/span/@title' + cookTime_path = '//li[2]/span/span/@title' + recipeYield_path = '//*[@class="yield"]/text()[2]' + ingredients_path = '//*[@class="ingredient"]/text()' + datePublished = '//*[@class="recipe-info"]/cite/text()' + + recipes = [] + + for r_scope in recipes_scopes: + il = RecipeItemLoader(item=RecipeItem()) + + il.add_value('source', self.source) + + il.add_value('name', r_scope.select(name_path).extract()) + il.add_value('image', r_scope.select(image_path).extract()) + il.add_value('url', response.url) + il.add_value('description', 
r_scope.select(description_path).extract()) + + il.add_value('prepTime', r_scope.select(prepTime_path).extract()) + il.add_value('cookTime', r_scope.select(cookTime_path).extract()) + il.add_value('recipeYield', r_scope.select(recipeYield_path).extract()) + + + + ingredients = [] + ingredient_scopes = r_scope.select(ingredients_path) + for ingredient_scope in ingredient_scopes: + ingredient = ingredient_scope.extract().strip() + if (ingredient): + ingredients.append(ingredient) + il.add_value('ingredients', ingredients) + + il.add_value('datePublished', r_scope.select(datePublished).extract()) + + recipes.append(il.load_item()) + + return recipes + + +class MarthastewartcrawlSpider(CrawlSpider, MarthastewartMixin): + + name = "www.marthastewart.com" + + allowed_domains = ["www.marthastewart.com"] + + start_urls = [ + "http://www.marthastewart.com/search/apachesolr_search/recipe" + + ] + + rules = ( + Rule(SgmlLinkExtractor(allow=('/search/apachesolr_search/recipe\?page=\d+'))), + + Rule(SgmlLinkExtractor(allow=('/\d+/[a-z-]+')), + callback='parse_item'), + ) diff --git a/scrapy_proj/openrecipes/spiders/mybakingaddiction_feedspider.py b/scrapy_proj/openrecipes/spiders/mybakingaddiction_feedspider.py new file mode 100644 index 0000000..a331f43 --- /dev/null +++ b/scrapy_proj/openrecipes/spiders/mybakingaddiction_feedspider.py @@ -0,0 +1,25 @@ +from scrapy.spider import BaseSpider +from scrapy.http import Request +from scrapy.selector import XmlXPathSelector +from openrecipes.spiders.mybakingaddiction_spider import MybakingaddictionMixin + + +class MybakingaddictionfeedSpider(BaseSpider, MybakingaddictionMixin): + + name = "mybakingaddiction.feed" + allowed_domains = [ + "mybakingaddiction.com", + "feeds.feedburner.com", + "feedproxy.google.com" + ] + start_urls = [ + "http://feeds.feedburner.com/mybakingaddiction", + ] + + def parse(self, response): + + xxs = XmlXPathSelector(response) + #check later + links = 
xxs.select("//item/*[local-name()='origLink']/text()").extract() + + return [Request(x, callback=self.parse_item) for x in links] diff --git a/scrapy_proj/openrecipes/spiders/simplyrecipe_feedspider.py b/scrapy_proj/openrecipes/spiders/simplyrecipe_feedspider.py new file mode 100644 index 0000000..8c83b72 --- /dev/null +++ b/scrapy_proj/openrecipes/spiders/simplyrecipe_feedspider.py @@ -0,0 +1,24 @@ +from scrapy.spider import BaseSpider +from scrapy.http import Request +from scrapy.selector import XmlXPathSelector +from openrecipes.spiders.simplyrecipes_spider import SimplyrecipesMixin + + +class SimplyrecipesfeedSpider(BaseSpider, SimplyrecipesMixin): + + name = "simplyrecipes.feed" + allowed_domains = [ + "simplyrecipes.com", + "feeds.feedburner.com", + "feedproxy.google.com" + ] + start_urls = [ + "http://feeds.feedburner.com/SimplyRecipesRecipesOnly", + ] + + def parse(self, response): + + xxs = XmlXPathSelector(response) + links = xxs.select("//item/*[local-name()='origLink']/text()").extract() + + return [Request(x, callback=self.parse_item) for x in links] diff --git a/scrapy_proj/openrecipes/spiders/spider_name_spider.py b/scrapy_proj/openrecipes/spiders/simplyrecipes_spider.py similarity index 55% rename from scrapy_proj/openrecipes/spiders/spider_name_spider.py rename to scrapy_proj/openrecipes/spiders/simplyrecipes_spider.py index 4fbc529..d28a24a 100644 --- a/scrapy_proj/openrecipes/spiders/spider_name_spider.py +++ b/scrapy_proj/openrecipes/spiders/simplyrecipes_spider.py @@ -4,26 +4,25 @@ from openrecipes.items import RecipeItem, RecipeItemLoader -class Spider_nameMixin(object): - source = 'spider_name' +class SimplyrecipesMixin(object): + source = 'simplyrecipes' def parse_item(self, response): hxs = HtmlXPathSelector(response) - base_path = 'TODO' + base_path = '//*[@itemtype="http://schema.org/Recipe"]' recipes_scopes = hxs.select(base_path) - name_path = 'TODO' - description_path = 'TODO' - image_path = 'TODO' - prepTime_path = 'TODO' - 
cookTime_path = 'TODO' - recipeYield_path = 'TODO' - ingredients_path = 'TODO' - datePublished = 'TODO' - + name_path = '//*[@class="recipe-callout"]/h2/text()' + description_path = './/*[@id="recipe-intronote"]/p/text()' + image_path = '//*[@itemprop="image"]/@src' + prepTime_path = './/span[@itemprop="prepTime"]/span/@title' + cookTime_path = './/span[@itemprop="cookTime"]/span/@title' + recipeYield_path = '//*[@itemprop="recipeYield"]/text()' + ingredients_path = '//*[@itemprop="ingredients"]/text()' + datePublished = '//*[@class="entry-date"]/text()' recipes = [] for r_scope in recipes_scopes: @@ -40,10 +39,12 @@ def parse_item(self, response): il.add_value('cookTime', r_scope.select(cookTime_path).extract()) il.add_value('recipeYield', r_scope.select(recipeYield_path).extract()) - ingredient_scopes = r_scope.select(ingredients_path) ingredients = [] - for i_scope in ingredient_scopes: - pass + ingredient_scopes = r_scope.select(ingredients_path) + for ingredient_scope in ingredient_scopes: + ingredient = ingredient_scope.extract().strip() + if (ingredient): + ingredients.append(ingredient) il.add_value('ingredients', ingredients) il.add_value('datePublished', r_scope.select(datePublished).extract()) @@ -53,19 +54,19 @@ def parse_item(self, response): return recipes -class Spider_namecrawlSpider(CrawlSpider, Spider_nameMixin): +class SimplyrecipescrawlSpider(CrawlSpider, SimplyrecipesMixin): - name = "START_URL" + name = "simplyrecipes.com" - allowed_domains = ["START_URL"] + allowed_domains = ["simplyrecipes.com"] start_urls = [ - "START_URL", + "http://www.simplyrecipes.com/index/", ] rules = ( - Rule(SgmlLinkExtractor(allow=('TODO'))), + Rule(SgmlLinkExtractor(allow=('/recipes/ingredient/[a-z-]+/'))), - Rule(SgmlLinkExtractor(allow=('TODO')), + Rule(SgmlLinkExtractor(allow=('/recipes/[a-z_]+/')), callback='parse_item'), ) diff --git a/scrapy_proj/openrecipes/spiders/twopeasandtheirpod_feedspider.py 
b/scrapy_proj/openrecipes/spiders/twopeasandtheirpod_feedspider.py new file mode 100644 index 0000000..588285b --- /dev/null +++ b/scrapy_proj/openrecipes/spiders/twopeasandtheirpod_feedspider.py @@ -0,0 +1,25 @@ +from scrapy.spider import BaseSpider +from scrapy.http import Request +from scrapy.selector import XmlXPathSelector +from openrecipes.spiders.twopeasandtheirpod_spider import TwopeasandtheirpodMixin + + +class TwopeasandtheirpodfeedSpider(BaseSpider, TwopeasandtheirpodMixin): + + name = "twopeasandtheirpod.feed" + + allowed_domains = [ + "twopeasandtheirpod.com", + "feeds.feedburner.com", + "feedproxy.google.com" + ] + start_urls = [ + "http://feeds.feedburner.com/twopeasandtheirpod/rNNF", + ] + + def parse(self, response): + + xxs = XmlXPathSelector(response) + links = xxs.select("//item/*[local-name()='origLink']/text()").extract() + + return [Request(x, callback=self.parse_item) for x in links]