Skip to content

Commit

Permalink
fix: crawl all categories
Browse files Browse the repository at this point in the history
  • Loading branch information
marzzzello committed Jul 24, 2021
1 parent dd19857 commit fd4f77d
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 10 deletions.
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,18 @@
[![commit-activity](https://img.shields.io/github/commit-activity/m/marzzzello/appstore_crawler.svg?style=for-the-badge)](https://img.shields.io/github/commit-activity/m/marzzzello/appstore_crawler.svg?style=for-the-badge)
[![Mastodon Follow](https://img.shields.io/mastodon/follow/103207?domain=https%3A%2F%2Fsocial.tchncs.de&logo=mastodon&style=for-the-badge)](https://social.tchncs.de/@marzzzello)

# Apple Appstore Crawler
# Apple App Store Crawler

This crawler is based on [scrapy](https://docs.scrapy.org/en/latest/) and can download the IDs of all apps in appstore.
This crawler is based on [scrapy](https://docs.scrapy.org/en/latest/) and can download the IDs of all apps in the Apple App Store.
It can also download the metadata for a list of IDs.

### Get IDs

The crawler uses `https://apps.apple.com/{country}/genre/ios/id36` to get the categories and IDs by crawling all categories, letters and pages.
Since the web server has no rate limiting, there is no need to set a delay. A full crawl takes about 20 minutes (10-15 pages/second).

```sh
scrapy crawl -L INFO appstore_ids -a saveurls=False -a country=us -a level=1 -O out_ids.jl
scrapy crawl -L INFO appstore_ids -a saveurls=False -a country=us -a level=0 -O out_ids.jl
```

Parameters:
Expand Down Expand Up @@ -100,3 +101,4 @@ DOWNLOAD_DELAY_IDS = 0.0
```

The default delays are tested and should work well.
With the amp multi method and default settings, retrieving the metadata for 1 million apps takes about 3 hours.
14 changes: 11 additions & 3 deletions appstore/spiders/appstore_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ def parse_main(self, response):
)

main_categories_without_sub_urls = response.css('a.top-level-genre:only-child::attr(href)').getall()
# main_categories_with_sub = response.css('a.top-level-genre:not(:only-child)::attr(href)').getall()
main_categories_with_sub = response.css('a.top-level-genre:not(:only-child)::attr(href)').getall()
# subcategories = response.css('ul.top-level-subgenres a::attr(href)').getall()
sub_categories_urls = []

# main categorie that has multiple subcategories
for categorie in response.css('a.top-level-genre:not(:only-child)'):
Expand All @@ -71,6 +72,7 @@ def parse_main(self, response):
'url': subcat.attrib['href'],
}
)
sub_categories_urls.append(subcat.attrib['href'])

categories.append(
{
Expand All @@ -87,6 +89,12 @@ def parse_main(self, response):
for url in main_categories_without_sub_urls:
url = response.urljoin(url)
yield scrapy.Request(url, callback=self.parse_categorie)
for url in sub_categories_urls:
url = response.urljoin(url)
yield scrapy.Request(url, callback=self.parse_categorie)
for url in main_categories_with_sub:
url = response.urljoin(url)
yield scrapy.Request(url, callback=self.parse_categorie)

def parse_categorie(self, response):
cat_id = response.url.split('/id')[1]
Expand All @@ -110,7 +118,7 @@ def parse_categorie(self, response):
yield scrapy.Request(url, callback=self.parse_categorie_letter)

def parse_categorie_letter(self, response):
cat_id, end = response.url.split('/id')[1].split('?letter=')
cat_id, end = response.url.split('/id')[-1].split('?letter=')
if len(end) == 1:
letter = end
page = '0'
Expand All @@ -119,7 +127,7 @@ def parse_categorie_letter(self, response):

print(f'Parsing {cat_id} {letter} {page:>3};', end=' ')
print(
f'Done {num_fmt(self._pages):>4}/~15k pages, {num_fmt(self._apps):>5}/~1.6M apps ',
f'Done {num_fmt(self._pages):>4}/~20k pages, {num_fmt(self._apps):>5}/~2.3M apps ',
end='\r',
)

Expand Down
8 changes: 4 additions & 4 deletions appstore/spiders/appstore_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,13 +122,13 @@ def scrape_metadata(self):
base_url_ua = f'https://apps.apple.com/{self._country}/app/'
header_ua = {'User-Agent': self._UA}

while len(self._ids_amp) > 1 or (self._use_UA and len(self._ids_ua) > 1):
if self._use_UA and len(self._ids_ua) > 1:
while len(self._ids_amp) > 0 or (self._use_UA and len(self._ids_ua) > 0):
if self._use_UA and len(self._ids_ua) > 0:
app_id = self._ids_ua.pop()
url_ua = base_url_ua + 'id' + str(app_id)
yield scrapy.Request(url_ua, self.parse_ua, headers=header_ua)

if len(self._ids_amp) > 1:
if len(self._ids_amp) > 0:
if self._amp_single:
app_id = self._ids_amp.pop()
url_amp = base_url_amp + '/' + str(app_id) + '?' + self.get_params()
Expand Down Expand Up @@ -263,7 +263,7 @@ def get_params(self, ids={}):
'limit[reviews]': 20,
'l': self._locale,
}
if ids is {}:
if ids == {}:
del params['ids']
else:
del params['include']
Expand Down

0 comments on commit fd4f77d

Please sign in to comment.