Skip to content

Commit

Permalink
Handle already downloaded papers
Browse files Browse the repository at this point in the history
  • Loading branch information
KyriakosFrang committed Mar 4, 2017
1 parent 17590c7 commit 8c1a548
Showing 1 changed file with 51 additions and 43 deletions.
94 changes: 51 additions & 43 deletions dblp_xml_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ def download_and_store(self, paper, db):
:return:
"""
#global filters
global skip
# the ee XML tag indicates that this paper has some kind of source attached (this will usually be an URL)
if 'ee' in paper:
# Do we want to skip this file? There are lots of reasons, see below... Skipping means we will not try to download it
Expand Down Expand Up @@ -231,64 +232,68 @@ def download_and_store(self, paper, db):
}
print("Publication matched: " + str(paper["dblpkey"]))
skip = False
filename = paper['dblpkey'] + ".pdf"
req = ""
actual_url = ""
url_open = ""
#down_info = db.downloads.find_one({'dblpkey': paper['dblpkey']})

if self.skipPreviouslyAccessedURLs and self.storeToMongo:
result = db.downloads.find_one({'_id': downloadinfo['_id']})
if result is None:
skip = False
# if it wasn't successful try once more
elif result['success'] is False:
skip = False
else:
# skip = True
if result['success']:
skip = True
global numOfPDFobtained
global paperCounter
global numOfPDFobtainedInThisSession
numOfPDFobtained += 1
if numOfPDFobtained % self.statusEveryXdownloads is 0:
logging.info(
'DBLP XML PROGRESS: XML Paper Entries {} PDFs {} PDFs in this Session {} '.format(
paperCounter, numOfPDFobtained, numOfPDFobtainedInThisSession))
else:
skip = False # url not in download collection of mongo db

down_info = db.downloads.find_one({'dblpkey': paper['dblpkey']})
if (down_info is None) or (down_info['success'] is False):

if skip is False:

pub_info = db.publications.find_one({'dblpkey': paper['dblpkey']})
if pub_info is None:
skip = False
req = ""
actual_url = ""
url_open = ""

try:
req = Request(paper['ee'], headers={'User-Agent': 'Mozilla/5.0'})
url_open = urlopen(req)
if url_open.status != 200:
skip = True
raise BaseException("HTTPError {}".format(url_open.status))
else:
#if url_open.status != 200:
# skip = True
#raise BaseException("HTTPError {}".format(url_open.status))
#else:
# downloadinfo = {}
actual_url = url_open.geturl()
global num_of_access
# Here we need to add a time delay because we access the
# sleep for a random duration of time between 60 and 360 seconds
actual_url = url_open.geturl()
global num_of_access
# Here we need to add a time delay because we access the
# sleep for a random duration of time between 60 and 360 seconds

rndm_time = int(random.uniform(60, 360))
print(
rndm_time = int(random.uniform(60, 360))
print(
"Crawler sleeps for {} min - Times Access Repositories: {}".format(float(rndm_time / int(60)),
num_of_access))

num_of_access += 1
time.sleep(rndm_time)
if (paper['ee'].lower().endswith("pdf") and "pdf" in self.enabledScrapers) or (
num_of_access += 1
time.sleep(rndm_time)
if (paper['ee'].lower().endswith("pdf") and "pdf" in self.enabledScrapers) or (
"ieee" in str(actual_url)) or ("springer" in actual_url) or ("acm" in actual_url) or paper[
'ee'].startswith("http://www.aaai.org") or paper['ee'].startswith("http://www.icwsm.org"):
filename = paper['dblpkey'] + ".pdf"

# decide if we want to skip this entry (e.g., it has been accessed before and we are in the mood for skipping)
if self.skipPreviouslyAccessedURLs and self.storeToMongo:
result = db.downloads.find_one({'_id': downloadinfo['_id']})
if result is None:
skip = False
# if it wasn't successful try once more
elif not result['success']:
skip = False
else:
skip = True
if result['success']:
skip = True
global numOfPDFobtained
global paperCounter
global numOfPDFobtainedInThisSession
numOfPDFobtained += 1
if numOfPDFobtained % self.statusEveryXdownloads is 0:
logging.info(
'DBLP XML PROGRESS: XML Paper Entries {} PDFs {} PDFs in this Session {} '.format(
paperCounter, numOfPDFobtained, numOfPDFobtainedInThisSession))
else:
skip = False # url not in download collection of mongo db
else:
filename = paper['dblpkey'] + ".pdf"
skip = False
# decide if we want to skip this entry (e.g., it has been accessed before and we are in the mood for skipping)
else:
skip = True # this ee entry is not interesting to us
print("{}, Repository not supported: {}".format(paper['dblpkey'], actual_url))
downloadinfo['success'] = False
Expand All @@ -301,6 +306,7 @@ def download_and_store(self, paper, db):
logging.exception('Cannot download or store ' + paper['ee'] + " with dblpkey: " + paper['dblpkey'],
exc_info=True)
skip = True # error with the url_open so skip the download
print("first try catch!!! skip: {}".format(skip))
if self.storeToMongo:
downloadinfo['success'] = False
ex = sys.exc_info()
Expand All @@ -314,6 +320,7 @@ def download_and_store(self, paper, db):
skip = True # already exist in the db

# Do the Download and store to MongoDB
print("Proceed with: {} : the download and store: Skip: {}".format(paper['dblpkey'],skip))
if not skip:
try:

Expand Down Expand Up @@ -383,6 +390,7 @@ def download_and_store(self, paper, db):
except BaseException:
logging.exception('Cannot download or store ' + paper['ee'] + " with dblpkey: " + paper['dblpkey'],
exc_info=True)
print("second try catch")
if self.storeToMongo:
downloadinfo['success'] = False
ex = sys.exc_info()
Expand Down

0 comments on commit 8c1a548

Please sign in to comment.