dblp_xml_processing.py

import logging
import dateutil.parser
import config as cfg
from pyhelpers import tools
import plac
import json
tools.setup_logging(file_name="xml_processor.log")

import sys
from lxml import etree
import gzip
import datetime

# modules to extract acm papers
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
# import time to set a sleeping mode to avoid HTTP error: 503/403/303
import time
import random
import json
import re

# add the number of access in acm to set sleep mode
num_of_access_in_acm = 0
# add the number of access in springer to set sleep mode
num_of_access_in_springer = 0
# add the number of access in aaai to set sleep mode
num_of_access_in_aaai = 0
# add the number of access in icwsm site to set sleep mode
num_of_access_in_icwsm = 0
# add the number of access in ieee site to set sleep mode
num_of_access_in_ieee = 0

num_of_access = 0

numOfPDFobtained = 0
numOfPDFobtainedInThisSession = 0

class XmlProcessing:

  def __init__(self, booktitles, journals):
    """
    Initialize the xml processing thingy
    """
    # set to true if you want to persist to a local mongo DB (default connection)

    self.storeToMongo = cfg.storeToMongo

    if booktitles is None:
      # GET THE VENUES WE LIKE from config.py
      self.booktitles = cfg.booktitles
      print('Conference of Interest: {}'.format(cfg.booktitles))
    else:
      self.booktitles = booktitles
      print('Conference of Interest: {}'.format(self.booktitles))

    if journals is None:
      # GET THE VENUES WE LIKE from config.py
      self.journals = cfg.journals
      print('Journals of Interest: {}'.format(cfg.journals))
    else:
      self.journals = journals
      print('Journals of Interest: {}'.format(self.journals))


    # create all the  folders
    tools.create_all_folders()

    # initialize connection to local mongoDB, database is named pub
    if self.storeToMongo:
      self.db = tools.connect_to_mongo()
    else:
      self.db = None

    # set to true if you want to skip downloading EE entries (pdf URLs) which have been accessed before (either sucessfully or unsucessfully)
    # this only works if storeToMongo is set to True because the MongoDB must be accessed for that. (if you set storeToMongo to false, I will
    # just assume that MongoDB is simply not active / there
    self.skipPreviouslyAccessedURLs = cfg.skipPreviouslyAccessedURLs

    # if they are new additions of papers this variable will be True
    # if it is true then proceed to the text_extraction phase
    self.newPapersIn= False

    # the categories you are interested in
    self.CATEGORIES = cfg.CATEGORIES

    # the categories you are NOT interested in
    self.SKIP_CATEGORIES = cfg.SKIP_CATEGORIES

    # the fields which should be in your each data item / mongo entry
    self.DATA_ITEMS = cfg.DATA_ITEMS

    # prints out a progress every X attempted downloads (including skips which had been downloaded before)
    self.statusEveryXdownloads = cfg.statusEveryXdownloads
    self.statusEveryXxmlLoops = cfg.statusEveryXxmlLoops

    self.filters = {}

    # enabledScrapers = ["pdf", "acm", "springer", "ieee", "aaai", 'icwsm']
    self.enabledScrapers = {"pdf"}


    #tools.create_all_folders()
    # just a counter
    global numOfPDFobtained
    global numOfPDFobtainedInThisSession
    #numOfPDFobtained = 0
    #numOfPDFobtainedInThisSession = 0

    # In each update attempt the xml files are deleted
    # and downloaded again  overwrite= True
    # get xml files
    tools.downloadFileWithProgress('http://dblp.uni-trier.de/xml/dblp.xml.gz', incrementKB=10 * 1024,
                                   folder=cfg.folder_dblp_xml, overwrite=cfg.overwriteDBLP_XML)
    tools.downloadFileWithProgress('http://dblp.uni-trier.de/xml/dblp.dtd', incrementKB=10 * 1024,
                                   folder=cfg.folder_dblp_xml, overwrite=cfg.overwriteDBLP_XML)

    # open xml and iterate over xml tree to extract relevant stuff
    with gzip.open(cfg.folder_dblp_xml + "dblp.xml.gz", 'rb') as file:
      context = etree.iterparse(file, dtd_validation=True, events=("start", "end"))
      self.fast_iter2(context, self.db)

  def clear_element(self, element):
    """
    helper just used for parsing the XML
    :param element:
    :return:
    """
    element.clear()
    while element.getprevious() is not None:
      del element.getparent()[0]

  def extract_paper_elements(self, context):
    """
    helper just used for parsing the XML
    :param context:
    :return:
    """
    for event, element in context:
      if element.tag in self.CATEGORIES:
        yield element
        self.clear_element(element)

  def fast_iter2(self, context, db):
    """
    main XML parser: iterates over the xml and extracts relevant info
    :param context:
    :param db:
    :return:
    """
    global paperCounter
    try:
      for paperCounter, element in enumerate(self.extract_paper_elements(context)):
        # extract basic metadata from the dblp XML
        paper = {
          'type': element.tag
          # 'mdate': element.get("mdate"),
        }
        if 'key' in element.attrib:
          paper['dblpkey'] = element.get('key', default=None)
        else:
          paper['dblpkey'] = None
        if paper['dblpkey'] is not None:
          paper['dblpkey'] = tools.normalizeDBLPkey(paper['dblpkey'])

        paper['authors'] = [author.text for author in element.findall("author")]
        for data_item in self.DATA_ITEMS:
          data = element.find(data_item)
          if data is not None:
            paper[data_item] = data.text

        if paper['type'] not in self.SKIP_CATEGORIES:
          # try to download and store the thing if it is not in one of the skipped categories
          self.download_and_store(paper, db)
        # print(paperCounter, paperCounter % statusEveryXxmlLoops)
        if (paperCounter % self.statusEveryXxmlLoops) == 0:
          print('.', end="")
          sys.stdout.flush()
    except etree.XMLSyntaxError:
      print("End of XML file")


  def download_and_store(self, paper, db):
    """
    stores stuff in mongo db, and downloads the PDF
    :param paper:
    :param db:
    :return:
    """
    #global filters

    global skip
    # the ee XML tag indicates that this paper has some kind of source attached (this will usually be an URL)
    if 'ee' in paper:
      # Do we want to skip this file? There are lots of reasons, see below... Skipping means we will not try to download it

      skip = False
      # filters have been set
      """
      if len(filters) > 0:
        for k, v in filters.items():
          if k == 'scraper':
            self.enabledScrapers.add(v)
            continue
          if not (k in paper and paper[k] == v):
            skip = True
        if not skip:
          if "dblpkey" in paper:
            print("Filter matched: " + str(paper["dblpkey"]))
      """


      # do NOT skip if paper has a key, an ee entry
      if (not skip and type(paper['dblpkey']) is str and type(paper['ee']) is str) and \
        (('booktitle' in paper and paper['booktitle'] in self.booktitles) or
           ('journal' in paper and paper['journal'] in self.journals)):
        # check if it one of our supported types. IMPORTANT: ADD NEW TYPES HERE IF WE HAVE THEM!
        # also here we are checking if the resolved doi belonges in to one of our crawlers
        # if yes then we proceed with the download otherwise we store the url in a file
        # with the not-supported repositories

        ## check if the paper was already succefully downloaded
        # downloadinfo is the dictionary which is later stored in the Mongo "downloads" collection to memorize
        # which URLs have been accessed, and if that was successfull or not
        downloadinfo = {
          '_id': paper['ee'],
          'url': paper['ee'],
          'dblpkey': paper['dblpkey'],
          'lastaccessed': datetime.datetime.now(),
          'success': True
        }
        print("Publication matched: " + str(paper["dblpkey"]))
        skip = False
        filename = paper['dblpkey'] + ".pdf"
        req = ""
        actual_url = ""
        url_open = ""
        #down_info = db.downloads.find_one({'dblpkey': paper['dblpkey']})

        if self.skipPreviouslyAccessedURLs and self.storeToMongo:
          result = db.downloads.find_one({'_id': downloadinfo['_id']})


          if result is None:
            skip = False
          # if it wasn't successful try once more
          elif result['success'] is False:
            last_access = result['lastaccessed']
            current_Date = downloadinfo['lastaccessed']
            days_previous_check = (current_Date - last_access).days
            skip = True
            ####### change it later!!!!!!
            if days_previous_check >= 30:
              skip = False
              print()
              print("Paper: {}, Last Check: {} days ago!".format(paper['dblpkey'], days_previous_check))
              print()

          # check if the download date was greater than 30 days

         #elif number_of_Days < 2:
         #  skip = False
          else:
            skip = True
            if result['success']:
              skip = True
              global numOfPDFobtained
              global paperCounter
              global numOfPDFobtainedInThisSession
              numOfPDFobtained += 1
              if numOfPDFobtained % self.statusEveryXdownloads is 0:
                logging.info(
                  'DBLP XML PROGRESS: XML Paper Entries {}      PDFs {}     PDFs in this Session {} '.format(
                    paperCounter, numOfPDFobtained, numOfPDFobtainedInThisSession))
        else:
          skip = False  # url not in download collection of mongo db


        if skip is False:

          pub_info = db.publications.find_one({'dblpkey': paper['dblpkey']})
          if pub_info is None:
            skip = False

            try:
              req = Request(paper['ee'], headers={'User-Agent': 'Mozilla/5.0'})
              url_open = urlopen(req)
              #if url_open.status != 200:
              #  skip = True
                #raise BaseException("HTTPError {}".format(url_open.status))
              #else:
                # downloadinfo = {}
              actual_url = url_open.geturl()
              global num_of_access
              # Here we need to add a time delay because we access the
              # sleep for a random duration of time between 60 and 360 seconds

              rndm_time = int(random.uniform(60, 360))
              print(
                  "Crawler sleeps for {} min - Times Access Repositories: {}".format(float(rndm_time / int(60)),
                                                                                       num_of_access))

              num_of_access += 1
              time.sleep(rndm_time)
              if (paper['ee'].lower().endswith("pdf") and "pdf" in self.enabledScrapers) or (
                      "ieee" in str(actual_url)) or ("springer" in actual_url) or ("acm" in actual_url) or paper[
                    'ee'].startswith("http://www.aaai.org") or paper['ee'].startswith("http://www.icwsm.org"):
                filename = paper['dblpkey'] + ".pdf"
                skip = False
                # decide if we want to skip this entry (e.g., it has been accessed before and we are in the mood for skipping)
              else:
                  skip = True  # this ee entry is not interesting to us
                  print("{}, Repository not supported: {}".format(paper['dblpkey'], actual_url))
                  downloadinfo['success'] = False
                  downloadinfo['error'] = "{}, Repository not supported: {}".format(paper['dblpkey'], actual_url)
                  db.downloads.replace_one({'_id': downloadinfo['_id']}, downloadinfo, upsert=True)
                  with open(cfg.folder_log + "not_supported_repos.txt", 'a', encoding='UTF-8') as f:
                    f.write(actual_url)
                    f.write("\n")
            except BaseException:
              logging.exception('Cannot download or store ' + paper['ee'] + " with dblpkey: " + paper['dblpkey'],
                                  exc_info=True)
              skip = True  # error with the url_open so skip the download
              print("first try catch!!! skip: {}".format(skip))
              if self.storeToMongo:
                downloadinfo['success'] = False
                ex = sys.exc_info()
                downloadinfo['error'] = repr(ex)
                db.downloads.replace_one({'_id': downloadinfo['_id']}, downloadinfo, upsert=True)
          else:
            db.downloads.replace_one({'_id': downloadinfo['_id']}, downloadinfo)
            skip = True
        else:
          print("{} already in DB".format(paper['dblpkey']))
          skip = True  # already exist in the db

        # Do the Download and store to MongoDB
        #print("Proceed with: {} : the download and store: Skip: {}".format(paper['dblpkey'],skip))
        if not skip:
          try:

            # download based on type. IMPORTANT: Add supported types here, and also a few lines above!
            if paper['ee'].lower().endswith("pdf") and "pdf" in self.enabledScrapers:
              # Normal PDF download
              self.newPapersIn = True  # There are new additions
              skipped = not tools.downloadFile(downloadinfo['url'], overwrite=False, folder=cfg.folder_pdf,
                                                 localfilename=filename)

            elif "springer" in actual_url:
              # go to springer crawller
              self.newPapersIn = True  # There are new additions
              global num_of_access_in_springer
              num_of_access_in_springer += 1
              print("{}, publisher: Springer, #Access: {}".format(paper['dblpkey'], num_of_access_in_springer))
              skipped = not self.extract_paper_from_SPRINGER(url_open, filename)

            elif "acm" in actual_url:
              # go to acm crawler
              self.newPapersIn = True  # There are new additions
              global num_of_access_in_acm
              num_of_access_in_acm += 1
              print("{}, publisher: ACM, #Access: {}".format(paper['dblpkey'], num_of_access_in_acm))
              skipped = not self.extract_paper_from_ACM(url_open, filename)

            elif "ieee" in actual_url:
              # go to ieee crawler
              self.newPapersIn = True  # There are new additions
              global num_of_access_in_ieee
              num_of_access_in_ieee += 1
              print("{}, publisher: IEEE, #Access: {}".format(paper['dblpkey'], num_of_access_in_ieee))
              skipped = not self.extract_paper_from_IEEE(url_open, filename)

            elif paper['ee'].startswith("http://www.aaai.org"):
              # go to aaai crawler
              self.newPapersIn = True  # There are new additions
              global num_of_access_in_aaai
              num_of_access_in_aaai += 1
              print("{}, publisher: AAAI, #Access: {}".format(paper['dblpkey'], num_of_access_in_aaai))
              skipped = not self.extract_paper_from_AAAI(actual_url, filename)

            elif paper['ee'].startswith("http://www.icwsm.org"):
              # got to icwsm crawler
              self.newPapersIn = True  # There are new additions
              global num_of_access_in_icwsm
              num_of_access_in_icwsm += 1
              print("{}, publisher: ICWSM, #Access: {}".format(paper['dblpkey'], num_of_access_in_icwsm))
              skipped = not self.extract_paper_from_ICWSM(paper['ee'], filename)

            else:
              skipped = True

            if skipped:
              logging.info(' Used local PDF copy for ' + paper['dblpkey'])
            else:
              logging.info(' Downloaded ' + paper['dblpkey'])
              # global numOfPDFobtainedInThisSession
              numOfPDFobtainedInThisSession += 1
              # store
              if self.storeToMongo:
                # set additional data
                paper['_id'] = paper['dblpkey']
                # store to mongo
                db.publications.replace_one({'_id': paper['_id']}, paper, upsert=True)
                db.downloads.replace_one({'_id': downloadinfo['_id']}, downloadinfo, upsert=True)
          except BaseException:
            logging.exception('Cannot download or store ' + paper['ee'] + " with dblpkey: " + paper['dblpkey'],
                                exc_info=True)
            print("second try catch")
            if self.storeToMongo:
              downloadinfo['success'] = False
              ex = sys.exc_info()
              downloadinfo['error'] = repr(ex)
              db.downloads.replace_one({'_id': downloadinfo['_id']}, downloadinfo, upsert=True)


  def extract_paper_from_ICWSM(self, req, filename):
    """
      this function will access a given url  and will find the link of the pdf.
      Attention: WORKS ONLY IN THE TU DELFT NETWORK or VPN
      :param paper_url: e.g. "http://www.icwsm.org/papers/paper54.html"
      :return:
      """
    # reguest to the url, add headers to avoid  HTTP Error: 403 Forbidden
    # the site will strike you out because you are a robot!
    # req = Request(paper_url ,headers={'User-Agent': 'Mozilla/5.0'})
    webpage = req.read()
    # parse the html code
    soup = BeautifulSoup(webpage, 'html.parser')
    # select only the link tags
    for link in soup.find_all('a', href=True, text='PDF'):
      # the name of in the link tag is "FullTextPDF"
      prefix = "http://www.icwsm.org/papers/"
      suffix = link.get("href")
      pdf_link = prefix + suffix
      print("Access in " + pdf_link)
      return tools.downloadFile(url=pdf_link, folder=cfg.folder_pdf, overwrite=False,
                                localfilename=filename, printOutput=False)
    raise BaseException(req.geturl() + ' does not contain a valid ICWSM download link.')


  def extract_paper_from_AAAI(self, req, filename):
    """
      this function will access a given url  and will find the link of the pdf.
      Attention: WORKS ONLY IN THE TU DELFT NETWORK or VPN
      :param paper_url: e.g. "http://www.aaai.org/ocs/index.php/ICWSM/ICWSM16/paper/view/13130"
      :return:
      """
    # reguest to the url, add headers to avoid  HTTP Error: 403 Forbidden
    # the site will strike you out because you are a robot!
    paper_url = req
    if "viewPaper" not in paper_url:
      paper_url = paper_url.replace("view", "viewPaper")

    req = Request(paper_url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    # parse the html code
    soup = BeautifulSoup(webpage, 'html.parser')
    # select only the link tags
    for link in soup.find_all('a', href=True, text='PDF'):
      # the name of in the link tag is "FullTextPDF"
      pdf_link = link.get("href").replace("view", "download")
      print("Access in " + pdf_link)
      return tools.downloadFile(url=pdf_link, folder=cfg.folder_pdf, overwrite=False,
                                localfilename=filename, printOutput=False)
    raise BaseException(paper_url + ' does not contain a valid AAAI download link.')


  # python3 dblp_xml_processing.py -filter="{'booktitle' : 'ICSE', 'scraper' : 'acm'}"
  def extract_paper_from_IEEE(self,req, filename):
    """
      this function will access a given url  and will find the link of the pdf.
      Attention: WORKS ONLY IN THE TU DELFT NETWORK or VPN
      :param paper_url: e.g. "http://dx.doi.org/10.1007/BF00264597"
      :return:
      """
    # reguest to the url, add headers to avoid  HTTP Error: 403 Forbidden
    # the site will strike you out because you are a robot!

    webpage = req.read()
    # parse the html code
    soup = BeautifulSoup(webpage, 'html.parser')
    menus = json.loads(re.search(r"global.document.metadata\s*=\s*(.*);", soup.getText()).group(1))
    pdfpath = str(menus['pdfPath']).replace("iel", "ielx")
    pdf_link = "http://ieeexplore.ieee.org" + pdfpath

    print("Access in " + pdf_link)
    return tools.downloadFile(url=pdf_link, folder=cfg.folder_pdf, overwrite=False,
                              localfilename=filename, printOutput=False)
    # raise BaseException(req.geturl() + ' does not contain a valid IEEE download link.')


  def extract_paper_from_SPRINGER(self, req, filename):
    """
      this function will access a given url  and will find the link of the pdf.
      Attention: WORKS ONLY IN THE TU DELFT NETWORK or VPN
      :param paper_url: e.g. "http://dx.doi.org/10.1007/BF00264597"
      :return:
      """
    # reguest to the url, add headers to avoid  HTTP Error: 403 Forbidden
    # the site will strike you out because you are a robot!
    # req = Request(paper_url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = req.read()
    # parse the html code
    soup = BeautifulSoup(webpage, 'html.parser')
    # select only the link tags
    for link in soup.find_all('a'):
      # the name of in the link tag is "FullTextPDF"
      if str(link.get('href')).endswith('.pdf'):
        # for instance : "/content/pdf/10.1007%2FBF00264597.pdf"
        href_link = link.get('href')
        prefix = "http://link.springer.com"
        pdf_link = prefix + href_link

        print("Access in " + pdf_link)
        return tools.downloadFile(url=pdf_link, folder=cfg.folder_pdf, overwrite=False,
                                  localfilename=filename, printOutput=False)
    raise BaseException(req.geturl() + ' does not contain a valid SPRINGER download link.')


  def extract_paper_from_ACM(self, req, filename):
    """
      this function will access a given url  and will find the link of the pdf.
      Attention: WORKS ONLY IN THE TU DELFT NETWORK or VPN
      :param paper_url: DOI link of and ACM Page
      :return:
      """
    # reguest to the url, add headers to avoid  HTTP Error: 403 Forbidden
    # the site will strike you out because you are a robot!
    # req = Request(paper_url ,headers={'User-Agent': 'Mozilla/5.0'})
    webpage = req.read()
    # parse the html code
    soup = BeautifulSoup(webpage, 'html.parser')
    # select only the link tags
    for link in soup.find_all('a'):
      # the name of in the link tag is "FullTextPDF"
      if str(link.get('name')).endswith('PDF') or str(link.get('name')).endswith('Pdf') \
        or str(link.get('name')).endswith('pdf'):
        href_link = link.get('href')
        prefix = "http://dl.acm.org/"
        pdf_link = prefix + href_link
        # To avoid any conflicts I am taking the id of the link and the
        # ftid and I concatinate them together. I put +3 and +6 to
        # exclude  the "id=" and "ftid="
        pdf_id = href_link[href_link.find("id=") + 3: href_link.find("&f")]
        file_id = href_link[href_link.find("ftid=") + 6: href_link.find("&d")]
        # localfilename = pdf_id + '_' + file_id+'.pdf'
        # folder = "C:/Users/User/Documents/acm_pdfs/"
        print("donwload file " + pdf_link)
        return tools.downloadFile(url=pdf_link, folder=cfg.folder_pdf, overwrite=False,
                                  localfilename=filename, printOutput=False)
    raise BaseException(req.geturl() + ' does not contain a valid ACM download link.')

    #global filters
    #if filter:
    #  self.filters = json.loads(filter.replace("'", '"'))
    #  print("Using filters " + str(filters))