Skip to content
This repository has been archived by the owner on Feb 8, 2018. It is now read-only.

Added Marthastewart spider #183

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions scrapy_proj/openrecipes/spiders/cookstr_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from openrecipes.items import RecipeItem, RecipeItemLoader


class CookstrMixin(object):
    """Shared scraping logic for www.cookstr.com recipe pages.

    Mixed into a spider class; expects the spider framework to call
    parse_item() with an HTML response for a single recipe page.
    """

    source = 'cookstr'

    def parse_item(self, response):
        """Build a list of loaded RecipeItems from one recipe page.

        Returns a list because a page could, in principle, carry more
        than one schema.org/Recipe scope.
        """
        selector = HtmlXPathSelector(response)

        # Each microdata Recipe scope on the page yields one item.
        scopes = selector.select('//*[@itemtype="http://schema.org/Recipe"]')

        name_path = '//*[@class="recipe-title"]/text()'
        description_path = '//span[@class="recipe_structure_headnotes"]/p/text()'
        # The image src is site-relative, so the base URL is prepended
        # inside the XPath itself via concat().
        image_path = 'concat("http://www.cookstr.com", //*[@itemprop="image"]/@src)'
        prepTime_path = 'id("recipe_body")/div[4]/span/text()'
        # Some page layouts omit this block entirely.
        cookTime_path = 'id("recipe_body")/div[5]/span/text()'
        recipeYield_path = '//*[@itemprop="recipeYield"]/text()'
        ingredients_path = '//*[@itemprop="ingredients"]//text()'

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            loader.add_value('name', scope.select(name_path).extract())
            loader.add_value('image', scope.select(image_path).extract())
            loader.add_value('url', response.url)
            loader.add_value('description',
                             scope.select(description_path).extract())

            loader.add_value('prepTime', scope.select(prepTime_path).extract())
            loader.add_value('cookTime', scope.select(cookTime_path).extract())
            loader.add_value('recipeYield',
                             scope.select(recipeYield_path).extract())

            # Strip whitespace and drop empty text nodes.
            stripped = (node.extract().strip()
                        for node in scope.select(ingredients_path))
            loader.add_value('ingredients', [text for text in stripped if text])

            items.append(loader.load_item())

        return items


class CookstrcrawlSpider(CrawlSpider, CookstrMixin):
    """Crawls www.cookstr.com: follows search pagination and parses
    individual recipe pages with CookstrMixin.parse_item."""

    name = "www.cookstr.com"

    allowed_domains = ["www.cookstr.com"]

    start_urls = [
        # Resulting URL when submitting the site search with an empty query.
        "http://www.cookstr.com/searches?,"
    ]

    rules = (
        # Search-result pagination. The '?' must be escaped: unescaped it
        # is a regex quantifier making the preceding 's' optional, so the
        # literal '?page=' query string would never be matched.
        Rule(SgmlLinkExtractor(allow=(r'/searches\?page=\d+',))),

        # Recipe detail pages. The original pattern '\[a-z-]+' escaped the
        # opening bracket, destroying the character class; '[a-z-]+' is
        # the intended slug match.
        Rule(SgmlLinkExtractor(allow=(r'/recipes/[a-z-]+',)),
             callback='parse_item'),
    )
24 changes: 24 additions & 0 deletions scrapy_proj/openrecipes/spiders/dashingdish_feedspider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector
from openrecipes.spiders.dashingdish_spider import Dashingdish_spiderMixin


class DashingdishfeedSpider(BaseSpider, Dashingdish_spiderMixin):
    """Reads the Dashing Dish feedburner RSS feed and dispatches one
    request per entry to the mixin's parse_item."""

    name = "dashingdish.feed"
    allowed_domains = [
        "dashingdish.com",
        "feeds.feedburner.com",
        "feedproxy.google.com",
    ]
    start_urls = [
        "http://feeds.feedburner.com/dashingdish-recipes",
    ]

    def parse(self, response):
        """Return a Request for each entry's feedburner origLink."""
        feed = XmlXPathSelector(response)
        # origLink holds the canonical post URL (not the proxy link).
        urls = feed.select("//item/*[local-name()='origLink']/text()").extract()

        requests = []
        for url in urls:
            requests.append(Request(url, callback=self.parse_item))
        return requests
24 changes: 24 additions & 0 deletions scrapy_proj/openrecipes/spiders/davidlebovitz_feedspider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector
from openrecipes.spiders.davidlebovitz_spider import DavidlebovitzMixin


class DavidlebovitzfeedSpider(BaseSpider, DavidlebovitzMixin):
    """Reads the David Lebovitz blog feed and dispatches one request per
    entry to the mixin's parse_item."""

    name = "davidlebovitz.feed"
    allowed_domains = [
        "davidlebovitz.com",
        "feeds.feedburner.com",
        "feedproxy.google.com",
    ]
    start_urls = [
        "http://feeds.feedburner.com/davidlebovitz/blog",
    ]

    def parse(self, response):
        """Return a Request for each entry's feedburner origLink."""
        feed = XmlXPathSelector(response)
        # origLink holds the canonical post URL (not the proxy link).
        urls = feed.select("//item/*[local-name()='origLink']/text()").extract()

        requests = []
        for url in urls:
            requests.append(Request(url, callback=self.parse_item))
        return requests
24 changes: 24 additions & 0 deletions scrapy_proj/openrecipes/spiders/fortheloveofcooking_feedspider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector
from openrecipes.spiders.fortheloveofcooking_spider import FortheloveofcookingMixin


class FortheloveofcookingfeedSpider(BaseSpider, FortheloveofcookingMixin):
    """Reads the For the Love of Cooking feed and dispatches one request
    per entry to the mixin's parse_item."""

    name = "fortheloveofcooking.feed"
    allowed_domains = [
        "fortheloveofcooking.net",
        "feeds.feedburner.com",
        "feedproxy.google.com",
    ]
    start_urls = [
        "http://feeds.feedburner.com/blogspot/OlvyH",
    ]

    def parse(self, response):
        """Return a Request for each entry's feedburner origLink."""
        feed = XmlXPathSelector(response)
        # origLink holds the canonical post URL (not the proxy link).
        urls = feed.select("//item/*[local-name()='origLink']/text()").extract()

        requests = []
        for url in urls:
            requests.append(Request(url, callback=self.parse_item))
        return requests
77 changes: 77 additions & 0 deletions scrapy_proj/openrecipes/spiders/marthastewart_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from openrecipes.items import RecipeItem, RecipeItemLoader


class MarthastewartMixin(object):
    """Shared scraping logic for www.marthastewart.com recipe pages.

    Mixed into a spider class; expects the spider framework to call
    parse_item() with an HTML response for a single recipe page.
    """

    source = 'marthastewart'

    def parse_item(self, response):
        """Build a list of loaded RecipeItems from one recipe page.

        Returns a list because a page could, in principle, carry more
        than one hrecipe scope.
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//*[@class="hrecipe"]'

        recipes_scopes = hxs.select(base_path)

        name_path = '//*[@class="title fn"]/text()'
        description_path = '//*[@class="expand-body"]/text()'
        # The image src is site-relative, so the base URL is prepended
        # inside the XPath itself via concat().
        image_path = 'concat("http://www.marthastewart.com", //*[@class="img-l photo"]/@src)'
        prepTime_path = '//li[1]/span/span/@title'
        cookTime_path = '//li[2]/span/span/@title'
        recipeYield_path = '//*[@class="yield"]/text()[2]'
        ingredients_path = '//*[@class="ingredient"]/text()'
        datePublished = '//*[@class="recipe-info"]/cite/text()'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)
            il.add_value('description', r_scope.select(description_path).extract())

            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())

            # BUG FIX: the original ran this identical select() twice in a
            # row; the duplicate evaluation is removed.
            ingredient_scopes = r_scope.select(ingredients_path)

            # Strip whitespace and drop empty text nodes.
            ingredients = []
            for ingredient_scope in ingredient_scopes:
                ingredient = ingredient_scope.extract().strip()
                if ingredient:
                    ingredients.append(ingredient)
            il.add_value('ingredients', ingredients)

            il.add_value('datePublished', r_scope.select(datePublished).extract())

            recipes.append(il.load_item())

        return recipes


class MarthastewartcrawlSpider(CrawlSpider, MarthastewartMixin):
    """Crawls www.marthastewart.com: follows search pagination and parses
    individual recipe pages with MarthastewartMixin.parse_item."""

    name = "www.marthastewart.com"

    allowed_domains = ["www.marthastewart.com"]

    start_urls = [
        "http://www.marthastewart.com/search/apachesolr_search/recipe"
    ]

    rules = (
        # Search-result pagination. Two fixes vs. the original pattern:
        # '/d+' was a literal slash-d (typo for the digit class '\d+'),
        # and the unescaped '?' was a regex quantifier making the
        # preceding 'e' optional instead of matching '?page='.
        Rule(SgmlLinkExtractor(allow=(r'/search/apachesolr_search/recipe\?page=\d+',))),

        # Recipe detail pages: a numeric id segment followed by a slug.
        Rule(SgmlLinkExtractor(allow=(r'/\d+/[a-z-]+',)),
             callback='parse_item'),
    )
25 changes: 25 additions & 0 deletions scrapy_proj/openrecipes/spiders/mybakingaddiction_feedspider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector
from openrecipes.spiders.mybakingaddiction_spider import MybakingaddictionMixin


class MybakingaddictionfeedSpider(BaseSpider, MybakingaddictionMixin):
    """Reads the My Baking Addiction feed and dispatches one request per
    entry to the mixin's parse_item."""

    name = "mybakingaddiction.feed"
    allowed_domains = [
        "mybakingaddiction.com",
        "feeds.feedburner.com",
        "feedproxy.google.com",
    ]
    start_urls = [
        "http://feeds.feedburner.com/mybakingaddiction",
    ]

    def parse(self, response):
        """Return a Request for each entry's feedburner origLink."""
        feed = XmlXPathSelector(response)
        # origLink holds the canonical post URL (not the proxy link).
        urls = feed.select("//item/*[local-name()='origLink']/text()").extract()

        requests = []
        for url in urls:
            requests.append(Request(url, callback=self.parse_item))
        return requests
24 changes: 24 additions & 0 deletions scrapy_proj/openrecipes/spiders/simplyrecipe_feedspider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector
from openrecipes.spiders.simplyrecipes_spider import SimplyrecipesMixin


class SimplyrecipesfeedSpider(BaseSpider, SimplyrecipesMixin):
    """Reads the Simply Recipes recipes-only feed and dispatches one
    request per entry to the mixin's parse_item."""

    name = "simplyrecipes.feed"
    allowed_domains = [
        "simplyrecipes.com",
        "feeds.feedburner.com",
        "feedproxy.google.com",
    ]
    start_urls = [
        "http://feeds.feedburner.com/SimplyRecipesRecipesOnly",
    ]

    def parse(self, response):
        """Return a Request for each entry's feedburner origLink."""
        feed = XmlXPathSelector(response)
        # origLink holds the canonical post URL (not the proxy link).
        urls = feed.select("//item/*[local-name()='origLink']/text()").extract()

        requests = []
        for url in urls:
            requests.append(Request(url, callback=self.parse_item))
        return requests
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,25 @@
from openrecipes.items import RecipeItem, RecipeItemLoader


class Spider_nameMixin(object):
source = 'spider_name'
class SimplyrecipesMixin(object):
source = 'simplyrecipes'

def parse_item(self, response):

hxs = HtmlXPathSelector(response)

base_path = 'TODO'
base_path = '//*[@itemtype="http://schema.org/Recipe"]'

recipes_scopes = hxs.select(base_path)

name_path = 'TODO'
description_path = 'TODO'
image_path = 'TODO'
prepTime_path = 'TODO'
cookTime_path = 'TODO'
recipeYield_path = 'TODO'
ingredients_path = 'TODO'
datePublished = 'TODO'

name_path = '//*[@class="recipe-callout"]/h2/text()'
description_path = './/*[@id="recipe-intronote"]/p/text()'
image_path = '//*[@itemprop="image"]/@src'
prepTime_path = './/span[@itemprop="prepTime"]/span/@title'
cookTime_path = './/span[@itemprop="cookTime"]/span/@title'
recipeYield_path = '//*[@itemprop="recipeYield"]/text()'
ingredients_path = '//*[@itemprop="ingredients"]/text()'
datePublished = '//*[@class="entry-date"]/text()'
recipes = []

for r_scope in recipes_scopes:
Expand All @@ -40,10 +39,12 @@ def parse_item(self, response):
il.add_value('cookTime', r_scope.select(cookTime_path).extract())
il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())

ingredient_scopes = r_scope.select(ingredients_path)
ingredients = []
for i_scope in ingredient_scopes:
pass
ingredient_scopes = r_scope.select(ingredients_path)
for ingredient_scope in ingredient_scopes:
ingredient = ingredient_scope.extract().strip()
if (ingredient):
ingredients.append(ingredient)
il.add_value('ingredients', ingredients)

il.add_value('datePublished', r_scope.select(datePublished).extract())
Expand All @@ -53,19 +54,19 @@ def parse_item(self, response):
return recipes


class Spider_namecrawlSpider(CrawlSpider, Spider_nameMixin):
class SimplyrecipescrawlSpider(CrawlSpider, SimplyrecipesMixin):

name = "START_URL"
name = "simplyrecipes.com"

allowed_domains = ["START_URL"]
allowed_domains = ["simplyrecipes.com"]

start_urls = [
"START_URL",
"http://www.simplyrecipes.com/index/",
]

rules = (
Rule(SgmlLinkExtractor(allow=('TODO'))),
Rule(SgmlLinkExtractor(allow=('/recipes/ingredient/[a-z-]+/'))),

Rule(SgmlLinkExtractor(allow=('TODO')),
Rule(SgmlLinkExtractor(allow=('/recipes/[a-z_]+/')),
callback='parse_item'),
)
25 changes: 25 additions & 0 deletions scrapy_proj/openrecipes/spiders/twopeasandtheirpod_feedspider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector
from openrecipes.spiders.twopeasandtheirpod_spider import TwopeasandtheirpodMixin


class TwopeasandtheirpodfeedSpider(BaseSpider, TwopeasandtheirpodMixin):
    """Reads the Two Peas and Their Pod feed and dispatches one request
    per entry to the mixin's parse_item."""

    name = "twopeasandtheirpod.feed"

    allowed_domains = [
        "twopeasandtheirpod.com",
        "feeds.feedburner.com",
        "feedproxy.google.com",
    ]
    start_urls = [
        "http://feeds.feedburner.com/twopeasandtheirpod/rNNF",
    ]

    def parse(self, response):
        """Return a Request for each entry's feedburner origLink."""
        feed = XmlXPathSelector(response)
        # origLink holds the canonical post URL (not the proxy link).
        urls = feed.select("//item/*[local-name()='origLink']/text()").extract()

        requests = []
        for url in urls:
            requests.append(Request(url, callback=self.parse_item))
        return requests