Merge pull request #44 from openeduhub/sis_language_fix

science_in_school_spider v0.0.2 ("general.language" fix)
hpi-schul-cloud · Jul 4, 2022 · a2f2c8b
2 parents 6846ac5 + b0c721f
commit a2f2c8b
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 11 deletions.
diff --git a/converter/spiders/sample_spider_alternative.py b/converter/spiders/sample_spider_alternative.py
@@ -82,7 +82,8 @@ def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
         #  - title                          required
         #  - keyword                        required
         #  - description                    required
-        #  - language                       recommended
+        #  - language                       recommended (edu-sharing expects underscores in language-codes, e.g. 'en-US'
+        #                                               needs to be replaced by 'en_US')
         #  - coverage                       optional
         #  - structure                      optional
         #  - aggregationLevel               optional

diff --git a/converter/spiders/science_in_school_spider.py b/converter/spiders/science_in_school_spider.py
@@ -17,16 +17,15 @@ class ScienceInSchoolSpider(scrapy.Spider, LomBase):
     start_urls = [
         "https://www.scienceinschool.org/issue/"
     ]
-    version = "0.0.1"
+    version = "0.0.2"  # last update: 2022-07-01
     custom_settings = {
         "AUTOTHROTTLE_ENABLED": True,
         "AUTOTHROTTLE_DEBUG": True
     }
     allowed_domains = [
         "scienceinschool.org"
     ]
-    DEBUG_ALL_ARTICLE_URLS = set()
-    DEBUG_LANGUAGES_AVAILABLE = set()
+    ALL_ARTICLE_URLS = set()
 
     TOPICS_TO_DISCIPLINES_MAPPING = {
         "Astronomy / space": "Astronomy",
@@ -82,8 +81,8 @@ def parse_article_overview(self, response: scrapy.http.Response) -> scrapy.Reque
         """
         article_urls = response.xpath('//h3[@class="vf-card__heading"]/a[@class="vf-card__link"]/@href').getall()
         # self.logger.info(f"Currently on {response.url} // Found {len(article_urls)} individual articles")
-        self.DEBUG_ALL_ARTICLE_URLS.update(article_urls)
-        # self.logger.info(f"Total URLs gathered so far: {len(self.DEBUG_ALL_ARTICLE_URLS)}")
+        self.ALL_ARTICLE_URLS.update(article_urls)
+        # self.logger.info(f"Total URLs gathered so far: {len(self.ALL_ARTICLE_URLS)}")
         for article_url in article_urls:
             yield scrapy.Request(url=article_url, callback=self.parse)
         pass
@@ -112,10 +111,10 @@ def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
         # on the left side of each article is a list of "Available languages", which holds URLs to all available
         # versions of the (currently visited) article, including its own URL. We need to make sure that we're only
         # gathering URLs that haven't been parsed before:
-        # self.logger.info(f"Before gathering article translations: {len(self.DEBUG_ALL_ARTICLE_URLS)}")
+        # self.logger.info(f"Before gathering article translations: {len(self.ALL_ARTICLE_URLS)}")
         if multilanguage_article_list:
             for article_translation_url in multilanguage_article_list:
-                if article_translation_url not in self.DEBUG_ALL_ARTICLE_URLS:
+                if article_translation_url not in self.ALL_ARTICLE_URLS:
                     # making sure we're not parsing translated articles more than once or causing loops
                     if article_translation_url.endswith('.pdf'):
                         # skipping direct-links to .pdf files because scrapy / splash can't handle these
@@ -126,9 +125,9 @@ def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
                         continue
                     else:
                         yield scrapy.Request(url=article_translation_url, callback=self.parse)
-            self.DEBUG_ALL_ARTICLE_URLS.update(multilanguage_article_list)
+            self.ALL_ARTICLE_URLS.update(multilanguage_article_list)
         # self.logger.info(f"This message should still be appearing after fetching article translations. URLs gathered "
-        #                  f"so far: {len(self.DEBUG_ALL_ARTICLE_URLS)}")
+        #                  f"so far: {len(self.ALL_ARTICLE_URLS)}")
 
         title: str = response.xpath('//meta[@property="og:title"]/@content').get()
         if title is None:
@@ -251,7 +250,10 @@ def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
         if description:
             general.add_value('description', description)
         if language:
-            general.add_value('language', language)
+            for language_item in language:
+                # edu-sharing expects the general.language value to use underscores (e.g. 'en_US' instead of 'en-US')
+                language_underscore: str = language_item.replace('-', '_')
+                general.add_value('language', language_underscore)
             # depending on the article language, we're creating sub-folders within edu-sharing:
             # SYNC_OBJ/science_in_school_spider/<language_code>/
             base.add_value('origin', language)