diff --git a/README.md b/README.md
index 279ff6c..feaeb2a 100644
--- a/README.md
+++ b/README.md
@@ -8,17 +8,18 @@
 [![commit-activity](https://img.shields.io/github/commit-activity/m/marzzzello/appstore_crawler.svg?style=for-the-badge)](https://img.shields.io/github/commit-activity/m/marzzzello/appstore_crawler.svg?style=for-the-badge)
 [![Mastodon Follow](https://img.shields.io/mastodon/follow/103207?domain=https%3A%2F%2Fsocial.tchncs.de&logo=mastodon&style=for-the-badge)](https://social.tchncs.de/@marzzzello)
 
-# Apple Appstore Crawler
+# Apple App Store Crawler
 
-This crawler is based on [scrapy](https://docs.scrapy.org/en/latest/) and can download the IDs of all apps in appstore.
+This crawler is based on [scrapy](https://docs.scrapy.org/en/latest/) and can download the IDs of all apps in the Apple App Store.
 It can also download the metadata for a list of IDs.
 
 ### Get IDs
 
 The crawler uses `https://apps.apple.com/{country}/genre/ios/id36` to get the categories and IDs by crawling all categories, letters and pages.
+Since the web server has no rate limiting, there is no need to set a delay. A full crawl takes about 20 minutes (10-15 pages/second).
 
 ```sh
-scrapy crawl -L INFO appstore_ids -a saveurls=False -a country=us -a level=1 -O out_ids.jl
+scrapy crawl -L INFO appstore_ids -a saveurls=False -a country=us -a level=0 -O out_ids.jl
 ```
 
 Parameters:
@@ -100,3 +101,4 @@
 DOWNLOAD_DELAY_IDS = 0.0
 ```
 The default delays are tested and should work well.
+With the amp multi method and default settings, retrieving the metadata for 1 million apps takes about 3 hours.
diff --git a/appstore/spiders/appstore_ids.py b/appstore/spiders/appstore_ids.py
index ab28dfa..64b052e 100644
--- a/appstore/spiders/appstore_ids.py
+++ b/appstore/spiders/appstore_ids.py
@@ -57,8 +57,9 @@ def parse_main(self, response):
         )
 
         main_categories_without_sub_urls = response.css('a.top-level-genre:only-child::attr(href)').getall()
-        # main_categories_with_sub = response.css('a.top-level-genre:not(:only-child)::attr(href)').getall()
+        main_categories_with_sub = response.css('a.top-level-genre:not(:only-child)::attr(href)').getall()
         # subcategories = response.css('ul.top-level-subgenres a::attr(href)').getall()
+        sub_categories_urls = []
 
         # main categorie that has multiple subcategories
         for categorie in response.css('a.top-level-genre:not(:only-child)'):
@@ -71,6 +72,7 @@
                         'url': subcat.attrib['href'],
                     }
                 )
+                sub_categories_urls.append(subcat.attrib['href'])
 
             categories.append(
                 {
@@ -87,6 +89,12 @@
         for url in main_categories_without_sub_urls:
             url = response.urljoin(url)
             yield scrapy.Request(url, callback=self.parse_categorie)
+        for url in sub_categories_urls:
+            url = response.urljoin(url)
+            yield scrapy.Request(url, callback=self.parse_categorie)
+        for url in main_categories_with_sub:
+            url = response.urljoin(url)
+            yield scrapy.Request(url, callback=self.parse_categorie)
 
     def parse_categorie(self, response):
         cat_id = response.url.split('/id')[1]
@@ -110,7 +118,7 @@
             yield scrapy.Request(url, callback=self.parse_categorie_letter)
 
     def parse_categorie_letter(self, response):
-        cat_id, end = response.url.split('/id')[1].split('?letter=')
+        cat_id, end = response.url.split('/id')[-1].split('?letter=')
         if len(end) == 1:
             letter = end
             page = '0'
@@ -119,7 +127,7 @@
         print(f'Parsing {cat_id} {letter} {page:>3};', end=' ')
         print(
-            f'Done {num_fmt(self._pages):>4}/~15k pages, {num_fmt(self._apps):>5}/~1.6M apps ',
+            f'Done {num_fmt(self._pages):>4}/~20k pages, {num_fmt(self._apps):>5}/~2.3M apps ',
             end='\r',
         )
diff --git a/appstore/spiders/appstore_metadata.py b/appstore/spiders/appstore_metadata.py
index bc9fc35..5f982cb 100644
--- a/appstore/spiders/appstore_metadata.py
+++ b/appstore/spiders/appstore_metadata.py
@@ -122,13 +122,13 @@ def scrape_metadata(self):
         base_url_ua = f'https://apps.apple.com/{self._country}/app/'
         header_ua = {'User-Agent': self._UA}
 
-        while len(self._ids_amp) > 1 or (self._use_UA and len(self._ids_ua) > 1):
-            if self._use_UA and len(self._ids_ua) > 1:
+        while len(self._ids_amp) > 0 or (self._use_UA and len(self._ids_ua) > 0):
+            if self._use_UA and len(self._ids_ua) > 0:
                 app_id = self._ids_ua.pop()
                 url_ua = base_url_ua + 'id' + str(app_id)
                 yield scrapy.Request(url_ua, self.parse_ua, headers=header_ua)
 
-            if len(self._ids_amp) > 1:
+            if len(self._ids_amp) > 0:
                 if self._amp_single:
                     app_id = self._ids_amp.pop()
                     url_amp = base_url_amp + '/' + str(app_id) + '?' + self.get_params()
@@ -263,7 +263,7 @@
             'limit[reviews]': 20,
             'l': self._locale,
         }
-        if ids is {}:
+        if ids == {}:
             del params['ids']
         else:
             del params['include']
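Reviewer note on the last hunk: in Python, `is` tests object identity, and every `{}` literal creates a fresh dict, so `ids is {}` was always `False` and the `del params['ids']` branch was unreachable. Below is a minimal standalone sketch (hypothetical names, reduced from the spider's `get_params` branching) showing why the fix matters:

```python
# Identity vs. equality for dicts: two empty dicts compare equal but are
# distinct objects, so `is` can never match a fresh `{}` literal.
a, b = {}, {}
assert a == b      # equal contents
assert a is not b  # different objects


def get_params(ids={}):
    # Hypothetical reduction of the spider's get_params() logic.
    params = {'ids': ids, 'include': 'reviews'}
    if ids == {}:  # with `ids is {}` this branch was unreachable
        del params['ids']
    else:
        del params['include']
    return params


assert get_params() == {'include': 'reviews'}
assert get_params({1: 'app'}) == {'ids': {1: 'app'}}
```

An equivalent idiomatic form would be `if not ids:`, which avoids both the identity pitfall and the comparison against a literal.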