From bcf908a6463a0cadcb2f79ba8ef368b1666beb98 Mon Sep 17 00:00:00 2001 From: KyriakosFrang Date: Fri, 3 Mar 2017 15:13:22 +0100 Subject: [PATCH] Add OPTIONAL arguments to enable parallel pipelines Example: python background_process.py --conf SIGIR --journal Some_journal --- background_process.py | 84 +++++++++++++++++++++++++++--------------- classify_and_NEE.py | 22 +++++++++-- dblp_xml_processing.py | 22 ++++++++--- pdf_text_extractor.py | 23 ++++++++++-- 4 files changed, 109 insertions(+), 42 deletions(-) diff --git a/background_process.py b/background_process.py index 8647a61..e018d8d 100644 --- a/background_process.py +++ b/background_process.py @@ -41,11 +41,11 @@ def exist_papers_with_out_content(): @catch_exceptions(cancel_on_failure=False) def update_process(): - XmlProcessing() + XmlProcessing(booktitles=None, journals=None) if exist_papers_with_out_content(): # if there are papers with out content proceed to text extraction & classify_NEE - TextExtraction() - classify_and_NEEextraction() + TextExtraction(booktitles=None, journals=None) + classify_and_NEEextraction(booktitles=None, journals=None) else: print("No new paper additions!") @@ -54,40 +54,66 @@ def update_process(): def main(): # create all the necessary folders + import argparse + parser = argparse.ArgumentParser() - if cfg.updateNow: - update_process() + # optional parameters to increase the modularity of the script + # you can start multiple parallel scripts with different conferences or journals + parser.add_argument("--conf", help="Provide the (Only one) conference you like") + parser.add_argument("--journal", help="Provide the (Only one) journal you like") + args, leftovers = parser.parse_known_args() - if cfg.checkDaily: - # perform update every Day - schedule.every().day.at("18:00").do(update_process) + booktitles = None + journals = None - if cfg.checkWeekly: - # Perform update every Friday - schedule.every().friday.at("18:00").do(update_process) + if args.conf is None and args.journal is None: 
+ print("No optional parameter proceed with configuration file") + if cfg.updateNow: + update_process() - # In order to perform separetly one of the three - # main phases, all the update features need to be False - if cfg.updateNow is False and cfg.checkDaily is False and cfg.checkWeekly is False: + if cfg.checkDaily: + # perform update every Day + schedule.every().day.at("18:00").do(update_process) - if cfg.only_pdf_download: - print("Perform XML processing!") - XmlProcessing() + if cfg.checkWeekly: + # Perform update every Friday + schedule.every().friday.at("18:00").do(update_process) + + # In order to perform separetly one of the three + # main phases, all the update features need to be False + if cfg.updateNow is False and cfg.checkDaily is False and cfg.checkWeekly is False: + + if cfg.only_pdf_download: + print("Perform XML processing!") + XmlProcessing(booktitles=None, journals=None) + + if cfg.only_text_extraction: + print("Perform Text Extraction processing!") + TextExtraction(booktitles=None, journals=None) + + if cfg.only_classify_nee: + print("Perform Rhetorical/Name entity extraction and classify!") + classify_and_NEEextraction(booktitles=None, journals=None) + + while True: + if not cfg.checkWeekly and not cfg.checkDaily: + break + else: + schedule.run_pending() + time.sleep(1) + + elif args.conf is not None or args.journal is not None: + if args.conf is not None: + booktitles = [str(args.conf)] + if args.journal is not None: + journals = [str(args.journal)] + + XmlProcessing(booktitles=booktitles, journals=journals) + TextExtraction(booktitles=booktitles, journals=journals) + classify_and_NEEextraction(booktitles=booktitles, journals=journals) - if cfg.only_text_extraction: - print("Perform Text Extraction processing!") - TextExtraction() - if cfg.only_classify_nee: - print("Perform Rhetorical/Name entity extraction and classify!") - classify_and_NEEextraction() - while True: - if not cfg.checkWeekly and not cfg.checkDaily: - break - else: - 
schedule.run_pending() - time.sleep(1) if __name__ == '__main__': main() diff --git a/classify_and_NEE.py b/classify_and_NEE.py index e70d1f8..683e3fe 100644 --- a/classify_and_NEE.py +++ b/classify_and_NEE.py @@ -10,12 +10,28 @@ class classify_and_NEEextraction: - def __init__(self): + def __init__(self, booktitles, journals): textrazor.api_key = "9f466f8622a88d099f740d54b435845746914cbc43c831652408a5eb" - self.booktitles = cfg.booktitles - self.journals = cfg.journals + #self.booktitles = cfg.booktitles + #self.journals = cfg.journals + + if booktitles is None: + # GET THE VENUES WE LIKE from config.py + self.booktitles = cfg.booktitles + print('Conference of Interest: {}'.format(cfg.booktitles)) + else: + self.booktitles = booktitles + print('Conference of Interest: {}'.format(self.booktitles)) + + if journals is None: + # GET THE VENUES WE LIKE from config.py + self.journals = cfg.journals + print('Journals of Interest: {}'.format(cfg.journals)) + else: + self.journals = journals + print('Journals of Interest: {}'.format(self.journals)) self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') diff --git a/dblp_xml_processing.py b/dblp_xml_processing.py index 4803e33..04acd55 100644 --- a/dblp_xml_processing.py +++ b/dblp_xml_processing.py @@ -37,7 +37,7 @@ class XmlProcessing: - def __init__(self): + def __init__(self, booktitles, journals): """ Initialize the xml processing thingy """ @@ -45,9 +45,21 @@ def __init__(self): self.storeToMongo = cfg.storeToMongo - print('Conference of Interest: {}'.format(cfg.booktitles)) - print('Journals of Interest: {}'.format(cfg.journals)) + if booktitles is None: + # GET THE VENUES WE LIKE from config.py + self.booktitles = cfg.booktitles + print('Conference of Interest: {}'.format(cfg.booktitles)) + else: + self.booktitles = booktitles + print('Conference of Interest: {}'.format(self.booktitles)) + if journals is None: + # GET THE VENUES WE LIKE from config.py + self.journals = cfg.journals + print('Journals 
of Interest: {}'.format(cfg.journals)) + else: + self.journals = journals + print('Journals of Interest: {}'.format(self.journals)) # create all the folders @@ -88,9 +100,7 @@ def __init__(self): - # GET THE VENUES WE LIKE - self.booktitles = cfg.booktitles - self.journals = cfg.journals + #tools.create_all_folders() # just a counter diff --git a/pdf_text_extractor.py b/pdf_text_extractor.py index 67a75ee..87c5aa1 100644 --- a/pdf_text_extractor.py +++ b/pdf_text_extractor.py @@ -14,13 +14,28 @@ class TextExtraction: - def __init__(self): + def __init__(self, booktitles, journals): # The booktitles are located in the config.py # If you are interested in specific conference just add it there - self.booktitles = cfg.booktitles - self.journals = cfg.journals - + #self.booktitles = cfg.booktitles + #self.journals = cfg.journals + + if booktitles is None: + # GET THE VENUES WE LIKE from config.py + self.booktitles = cfg.booktitles + print('Conference of Interest: {}'.format(cfg.booktitles)) + else: + self.booktitles = booktitles + print('Conference of Interest: {}'.format(self.booktitles)) + + if journals is None: + # GET THE VENUES WE LIKE from config.py + self.journals = cfg.journals + print('Journals of Interest: {}'.format(cfg.journals)) + else: + self.journals = journals + print('Journals of Interest: {}'.format(self.journals)) for booktitle in self.booktitles: print("Processing booktitle: {}".format(booktitle))