"""Scrapy spider that collects recipes from tablespoon.com.

Introduced by the patch "fixes #76 adds tablespoon spider" as
scrapy_proj/openrecipes/spiders/tablespoon_spider.py.
"""
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from openrecipes.items import RecipeItem, RecipeItemLoader


class TablespoonMixin(object):
    """Mixin providing ``parse_item`` for tablespoon.com recipe pages."""

    # Value stored in the ``source`` field of every item this mixin loads.
    source = 'tablespoon'

    def parse_item(self, response):
        """Load one recipe item per hRecipe block found in ``response``.

        Args:
            response: the page response passed in by the crawler.

        Returns:
            A list with one loaded ``RecipeItem`` for each element whose
            class attribute is exactly "recipe-area hrecipe"; the list is
            empty when the page has no such element.
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//*[@class="recipe-area hrecipe"]'
        recipes_scopes = hxs.select(base_path)

        # The field paths start with "." so they are evaluated relative to
        # each recipe scope.  A leading "//" is document-absolute even when
        # run on a sub-selector, which would hand every recipe the fields
        # of the whole page.
        name_path = './/*[@class="fn"]/text()'
        description_path = './/*[@class="summary"]/text()'
        image_path = './/*[@class="photo"]/@src'
        recipeYield_path = './/*[@class="servings"]/text()'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)
            il.add_value('description',
                         r_scope.select(description_path).extract())
            il.add_value('recipeYield',
                         r_scope.select(recipeYield_path).extract())

            recipes.append(il.load_item())

        return recipes


class TablespooncrawlSpider(CrawlSpider, TablespoonMixin):
    """Crawl tablespoon.com search pages and parse the recipes they link to."""

    name = "tablespoon.com"

    allowed_domains = ["tablespoon.com"]

    start_urls = [
        "http://www.tablespoon.com/search/#&page_type=",
    ]

    # Patterns are raw strings so that "\d" reaches the regex engine
    # unchanged instead of relying on an unrecognised string escape.
    rules = (
        # Search result pagination: followed, but not parsed.
        # NOTE(review): everything after "#" is a URL fragment; confirm the
        # link extractor really yields links that still carry it.
        Rule(SgmlLinkExtractor(
            allow=(r'/search/#&page_type=&sort=&page=\d+',))),

        # Recipe pages.  The original pattern escaped the opening bracket
        # ("\[a-z]+"), which matches the literal text "[a-z" instead of a
        # run of lowercase letters, so no recipe URL could ever match.
        # NOTE(review): confirm recipe URLs really are
        # /recipes/<lowercase letters>/<digits>/ (no hyphens in the slug).
        Rule(SgmlLinkExtractor(allow=(r'/recipes/[a-z]+/\d+/',)),
             callback='parse_item'),
    )