Skip to content

Commit

Permalink
Merge pull request #44 from openeduhub/sis_language_fix
Browse files Browse the repository at this point in the history
science_in_school_spider v0.0.2 ("general.language" fix)
  • Loading branch information
torsten-simon authored Jul 4, 2022
2 parents 6846ac5 + b0c721f commit a2f2c8b
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 11 deletions.
3 changes: 2 additions & 1 deletion converter/spiders/sample_spider_alternative.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
# - title required
# - keyword required
# - description required
# - language recommended
# - language recommended (edu-sharing expects underscores in language-codes, e.g. 'en-US'
# needs to be replaced by 'en_US')
# - coverage optional
# - structure optional
# - aggregationLevel optional
Expand Down
22 changes: 12 additions & 10 deletions converter/spiders/science_in_school_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,15 @@ class ScienceInSchoolSpider(scrapy.Spider, LomBase):
start_urls = [
"https://www.scienceinschool.org/issue/"
]
version = "0.0.1"
version = "0.0.2" # last update: 2022-07-01
custom_settings = {
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_DEBUG": True
}
allowed_domains = [
"scienceinschool.org"
]
DEBUG_ALL_ARTICLE_URLS = set()
DEBUG_LANGUAGES_AVAILABLE = set()
ALL_ARTICLE_URLS = set()

TOPICS_TO_DISCIPLINES_MAPPING = {
"Astronomy / space": "Astronomy",
Expand Down Expand Up @@ -82,8 +81,8 @@ def parse_article_overview(self, response: scrapy.http.Response) -> scrapy.Reque
"""
article_urls = response.xpath('//h3[@class="vf-card__heading"]/a[@class="vf-card__link"]/@href').getall()
# self.logger.info(f"Currently on {response.url} // Found {len(article_urls)} individual articles")
self.DEBUG_ALL_ARTICLE_URLS.update(article_urls)
# self.logger.info(f"Total URLs gathered so far: {len(self.DEBUG_ALL_ARTICLE_URLS)}")
self.ALL_ARTICLE_URLS.update(article_urls)
# self.logger.info(f"Total URLs gathered so far: {len(self.ALL_ARTICLE_URLS)}")
for article_url in article_urls:
yield scrapy.Request(url=article_url, callback=self.parse)
pass
Expand Down Expand Up @@ -112,10 +111,10 @@ def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
# on the left side of each article is a list of "Available languages", which holds URLs to all available
# versions of the (currently visited) article, including its own URL. We need to make sure that we're only
# gathering URLs that haven't been parsed before:
# self.logger.info(f"Before gathering article translations: {len(self.DEBUG_ALL_ARTICLE_URLS)}")
# self.logger.info(f"Before gathering article translations: {len(self.ALL_ARTICLE_URLS)}")
if multilanguage_article_list:
for article_translation_url in multilanguage_article_list:
if article_translation_url not in self.DEBUG_ALL_ARTICLE_URLS:
if article_translation_url not in self.ALL_ARTICLE_URLS:
# making sure we're not parsing translated articles more than once or causing loops
if article_translation_url.endswith('.pdf'):
# skipping direct-links to .pdf files because scrapy / splash can't handle these
Expand All @@ -126,9 +125,9 @@ def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
continue
else:
yield scrapy.Request(url=article_translation_url, callback=self.parse)
self.DEBUG_ALL_ARTICLE_URLS.update(multilanguage_article_list)
self.ALL_ARTICLE_URLS.update(multilanguage_article_list)
# self.logger.info(f"This message should still be appearing after fetching article translations. URLs gathered "
# f"so far: {len(self.DEBUG_ALL_ARTICLE_URLS)}")
# f"so far: {len(self.ALL_ARTICLE_URLS)}")

title: str = response.xpath('//meta[@property="og:title"]/@content').get()
if title is None:
Expand Down Expand Up @@ -251,7 +250,10 @@ def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
if description:
general.add_value('description', description)
if language:
general.add_value('language', language)
for language_item in language:
# edu-sharing expects the general.language value to use underscores (e.g. 'en_US' instead of 'en-US')
language_underscore: str = language_item.replace('-', '_')
general.add_value('language', language_underscore)
# depending on the article language, we're creating sub-folders within edu-sharing:
# SYNC_OBJ/science_in_school_spider/<language_code>/
base.add_value('origin', language)
Expand Down

0 comments on commit a2f2c8b

Please sign in to comment.