From 17590c77c03f2cfd4aad1c77d82d4d788eb881ef Mon Sep 17 00:00:00 2001
From: KyriakosFrang
Date: Fri, 3 Mar 2017 16:55:24 +0100
Subject: [PATCH] Handle already downloaded papers

---
 dblp_xml_processing.py | 150 ++++++++++++++++++++---------------------
 1 file changed, 75 insertions(+), 75 deletions(-)

diff --git a/dblp_xml_processing.py b/dblp_xml_processing.py
index 5bed662..478dd18 100644
--- a/dblp_xml_processing.py
+++ b/dblp_xml_processing.py
@@ -309,85 +309,85 @@ def download_and_store(self, paper, db):
                 else:
                     db.downloads.replace_one({'_id': downloadinfo['_id']}, downloadinfo)
                     skip = True
-            else:
-                print("{} already in DB".format(paper['dblpkey']))
-                skip = True  # already exist in the db
+        else:
+            print("{} already in DB".format(paper['dblpkey']))
+            skip = True  # already exists in the db
 
         # Do the Download and store to MongoDB
-            if not skip:
-                try:
-
-                    # download based on type. IMPORTANT: Add supported types here, and also a few lines above!
-                    if paper['ee'].lower().endswith("pdf") and "pdf" in self.enabledScrapers:
-                        # Normal PDF download
-                        self.newPapersIn = True  # There are new additions
-                        skipped = not tools.downloadFile(downloadinfo['url'], overwrite=False, folder=cfg.folder_pdf,
-                                                         localfilename=filename)
-
-                    elif "springer" in actual_url:
-                        # go to springer crawller
-                        self.newPapersIn = True  # There are new additions
-                        global num_of_access_in_springer
-                        num_of_access_in_springer += 1
-                        print("{}, publisher: Springer, #Access: {}".format(paper['dblpkey'], num_of_access_in_springer))
-                        skipped = not self.extract_paper_from_SPRINGER(url_open, filename)
-
-                    elif "acm" in actual_url:
-                        # go to acm crawler
-                        self.newPapersIn = True  # There are new additions
-                        global num_of_access_in_acm
-                        num_of_access_in_acm += 1
-                        print("{}, publisher: ACM, #Access: {}".format(paper['dblpkey'], num_of_access_in_acm))
-                        skipped = not self.extract_paper_from_ACM(url_open, filename)
-
-                    elif "ieee" in actual_url:
-                        # go to ieee crawler
-                        self.newPapersIn = True  # There are new additions
-                        global num_of_access_in_ieee
-                        num_of_access_in_ieee += 1
-                        print("{}, publisher: IEEE, #Access: {}".format(paper['dblpkey'], num_of_access_in_ieee))
-                        skipped = not self.extract_paper_from_IEEE(url_open, filename)
-
-                    elif paper['ee'].startswith("http://www.aaai.org"):
-                        # go to aaai crawler
-                        self.newPapersIn = True  # There are new additions
-                        global num_of_access_in_aaai
-                        num_of_access_in_aaai += 1
-                        print("{}, publisher: AAAI, #Access: {}".format(paper['dblpkey'], num_of_access_in_aaai))
-                        skipped = not self.extract_paper_from_AAAI(actual_url, filename)
-
-                    elif paper['ee'].startswith("http://www.icwsm.org"):
-                        # got to icwsm crawler
-                        self.newPapersIn = True  # There are new additions
-                        global num_of_access_in_icwsm
-                        num_of_access_in_icwsm += 1
-                        print("{}, publisher: ICWSM, #Access: {}".format(paper['dblpkey'], num_of_access_in_icwsm))
-                        skipped = not self.extract_paper_from_ICWSM(paper['ee'], filename)
-
-                    else:
-                        skipped = True
-
-                    if skipped:
-                        logging.info(' Used local PDF copy for ' + paper['dblpkey'])
-                    else:
-                        logging.info(' Downloaded ' + paper['dblpkey'])
-                        # global numOfPDFobtainedInThisSession
-                        numOfPDFobtainedInThisSession += 1
-                    # store
+        if not skip:
+            try:
+
+                # download based on type. IMPORTANT: Add supported types here, and also a few lines above!
+                if paper['ee'].lower().endswith("pdf") and "pdf" in self.enabledScrapers:
+                    # Normal PDF download
+                    self.newPapersIn = True  # There are new additions
+                    skipped = not tools.downloadFile(downloadinfo['url'], overwrite=False, folder=cfg.folder_pdf,
+                                                     localfilename=filename)
+
+                elif "springer" in actual_url:
+                    # go to springer crawler
+                    self.newPapersIn = True  # There are new additions
+                    global num_of_access_in_springer
+                    num_of_access_in_springer += 1
+                    print("{}, publisher: Springer, #Access: {}".format(paper['dblpkey'], num_of_access_in_springer))
+                    skipped = not self.extract_paper_from_SPRINGER(url_open, filename)
+
+                elif "acm" in actual_url:
+                    # go to acm crawler
+                    self.newPapersIn = True  # There are new additions
+                    global num_of_access_in_acm
+                    num_of_access_in_acm += 1
+                    print("{}, publisher: ACM, #Access: {}".format(paper['dblpkey'], num_of_access_in_acm))
+                    skipped = not self.extract_paper_from_ACM(url_open, filename)
+
+                elif "ieee" in actual_url:
+                    # go to ieee crawler
+                    self.newPapersIn = True  # There are new additions
+                    global num_of_access_in_ieee
+                    num_of_access_in_ieee += 1
+                    print("{}, publisher: IEEE, #Access: {}".format(paper['dblpkey'], num_of_access_in_ieee))
+                    skipped = not self.extract_paper_from_IEEE(url_open, filename)
+
+                elif paper['ee'].startswith("http://www.aaai.org"):
+                    # go to aaai crawler
+                    self.newPapersIn = True  # There are new additions
+                    global num_of_access_in_aaai
+                    num_of_access_in_aaai += 1
+                    print("{}, publisher: AAAI, #Access: {}".format(paper['dblpkey'], num_of_access_in_aaai))
+                    skipped = not self.extract_paper_from_AAAI(actual_url, filename)
+
+                elif paper['ee'].startswith("http://www.icwsm.org"):
+                    # go to icwsm crawler
+                    self.newPapersIn = True  # There are new additions
+                    global num_of_access_in_icwsm
+                    num_of_access_in_icwsm += 1
+                    print("{}, publisher: ICWSM, #Access: {}".format(paper['dblpkey'], num_of_access_in_icwsm))
+                    skipped = not self.extract_paper_from_ICWSM(paper['ee'], filename)
+
+                else:
+                    skipped = True
+
+                if skipped:
+                    logging.info(' Used local PDF copy for ' + paper['dblpkey'])
+                else:
+                    logging.info(' Downloaded ' + paper['dblpkey'])
+                    # global numOfPDFobtainedInThisSession
+                    numOfPDFobtainedInThisSession += 1
+                # store
+                if self.storeToMongo:
+                    # set additional data
+                    paper['_id'] = paper['dblpkey']
+                    # store to mongo
+                    db.publications.replace_one({'_id': paper['_id']}, paper, upsert=True)
+                    db.downloads.replace_one({'_id': downloadinfo['_id']}, downloadinfo, upsert=True)
+            except BaseException:
+                logging.exception('Cannot download or store ' + paper['ee'] + " with dblpkey: " + paper['dblpkey'],
+                                  exc_info=True)
                 if self.storeToMongo:
-                        # set additional data
-                        paper['_id'] = paper['dblpkey']
-                        # store to mongo
-                        db.publications.replace_one({'_id': paper['_id']}, paper, upsert=True)
+                    downloadinfo['success'] = False
+                    ex = sys.exc_info()
+                    downloadinfo['error'] = repr(ex)
                     db.downloads.replace_one({'_id': downloadinfo['_id']}, downloadinfo, upsert=True)
-                except BaseException:
-                    logging.exception('Cannot download or store ' + paper['ee'] + " with dblpkey: " + paper['dblpkey'],
-                                      exc_info=True)
-                    if self.storeToMongo:
-                        downloadinfo['success'] = False
-                        ex = sys.exc_info()
-                        downloadinfo['error'] = repr(ex)
-                        db.downloads.replace_one({'_id': downloadinfo['_id']}, downloadinfo, upsert=True)
 
 
     def extract_paper_from_ICWSM(self, req, filename):
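
Note: with 75 insertions against 75 deletions of identical content, the hunk appears to be an indentation-only move: the already-in-DB `else` branch and the `if not skip:` download-and-store block shift out one nesting level, so the skip flag set by the pre-check now governs the whole download path. The sketch below is a hypothetical condensation of the patched flow, not the method itself; the names taken from the hunk (`tools.downloadFile`, the `extract_paper_from_*` helpers, `db.publications`, `db.downloads`, `downloadinfo`) are real, while the `find_one` pre-check and the flattened signature are assumptions about code outside the hunk.

    import logging
    import sys

    def download_and_store_flow(self, paper, db, downloadinfo, filename):
        # Hypothetical condensation of the patched method; the real code in
        # dblp_xml_processing.py dispatches to one scraper per publisher.
        skip = False
        if self.storeToMongo:
            # Assumed pre-check (above the hunk): an existing download record
            # means the paper was already fetched in an earlier run.
            if db.downloads.find_one({'_id': downloadinfo['_id']}) is not None:
                print("{} already in DB".format(paper['dblpkey']))
                skip = True  # already exists in the db

        if not skip:
            try:
                # Publisher dispatch (PDF/Springer/ACM/IEEE/AAAI/ICWSM in the
                # real chain); a False return from the scraper means a local
                # copy was reused, so nothing new was downloaded.
                skipped = not self.extract_paper_from_ICWSM(paper['ee'], filename)
                if skipped:
                    logging.info(' Used local PDF copy for ' + paper['dblpkey'])
                else:
                    logging.info(' Downloaded ' + paper['dblpkey'])
                if self.storeToMongo:
                    paper['_id'] = paper['dblpkey']
                    db.publications.replace_one({'_id': paper['_id']}, paper, upsert=True)
                    db.downloads.replace_one({'_id': downloadinfo['_id']}, downloadinfo, upsert=True)
            except BaseException:
                logging.exception('Cannot download or store ' + paper['ee'])
                if self.storeToMongo:
                    # Record the failure so the paper is retried, not skipped.
                    downloadinfo['success'] = False
                    downloadinfo['error'] = repr(sys.exc_info())
                    db.downloads.replace_one({'_id': downloadinfo['_id']}, downloadinfo, upsert=True)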