diff --git a/config/__init__.py b/config/__init__.py
index 807f4e4..09e9b3e 100644
--- a/config/__init__.py
+++ b/config/__init__.py
@@ -5,3 +5,4 @@
 with open(join(root, 'settings.json')) as f:
     settings = json.load(f)
 
+
diff --git a/config/__pycache__/__init__.cpython-37.pyc b/config/__pycache__/__init__.cpython-37.pyc
deleted file mode 100644
index d51cf41..0000000
Binary files a/config/__pycache__/__init__.cpython-37.pyc and /dev/null differ
diff --git a/dblpSearch.py b/dblpSearch.py
index 4b23a69..6d04385 100644
--- a/dblpSearch.py
+++ b/dblpSearch.py
@@ -1,35 +1,76 @@
 import scraps.dblpSearcher as db
-import html_builder as hbuild
+import htmlBuilder as hb
 import os
 import webbrowser
 from config import settings
+import copy
+from utils.path import slugify
 
 # load parameters
 PUBLISHERS = settings['publishers']
 NUMBER_PER_PUBLISHER = int(settings['number_per_publisher'])
 NUMBER_PER_SEARCH = int(settings['number_per_search'])
 
-# init output file
-SAVE_PATH = 'output/dblp_output.html'
-SAVE_FULL_PATH = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), SAVE_PATH)
-SAVE_FULL_PATH = SAVE_FULL_PATH.replace('\\', '/')
-print(SAVE_FULL_PATH)
-target = "file:///{}".format(SAVE_FULL_PATH)
+
+
+def get_saving_path(name):
+    abs_path = 'output/{}.html'.format(name)
+    abs_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), abs_path)
+    abs_path = abs_path.replace('\\', '/')
+    return abs_path
+
+# print(SAVE_FULL_PATH)
+# target = "file:///{}".format(SAVE_FULL_PATH)
+
+
+def search(terms):
+    # terms = input("Search for: ")
+    info_ = None
+    while True:
+        m = input("Search by publishers? [y/n]:")
+        if m == 'y':
+            info_ = db.search_by_conference(terms, PUBLISHERS, NUMBER_PER_PUBLISHER)
+            break
+        elif m == 'n':
+            info_ = db.search(terms, NUMBER_PER_SEARCH)
+            break
+        else:
+            continue
+    return info_
+
+
+def merge_info(info1, info2):
+    info_ = copy.deepcopy(info1)
+    # get existed titles in info1
+    existed_titles = set()
+    for item in info1:
+        existed_titles.add(item['title'])
+    # append unseen info2 items to info1
+    for item in info2:
+        title = item['title']
+        if title not in existed_titles:
+            info_.append(item)
+            existed_titles.add(title)
+
+    return info_
+
 
 # search
 while True:
-    terms = input("Search for: ")
+    terms = [input("Search for: ")]
+    info = search(terms[-1])
     while True:
-        mode = input("Search by publishers? [y/n]:")
+        mode = input("Continue searching for other terms? [y/n]:")
         if mode == 'y':
-            info = db.search_by_conference(terms, PUBLISHERS, NUMBER_PER_PUBLISHER)
-            break
+            terms.append(input("Search for: "))
+            info = merge_info(info, search(terms[-1]))
         elif mode == 'n':
-            info = db.search(terms, NUMBER_PER_SEARCH)
             break
         else:
            continue
+
    info = db.sort_by_year(info)
-    hbuild.save_as_html(info, SAVE_PATH, heading=terms)
-    webbrowser.open('file://' + SAVE_FULL_PATH)
+    save_path = get_saving_path(slugify(terms))
+    hb.save_as_html(info, save_path, heading=str(terms))
+    webbrowser.open('file://' + save_path)
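A quick note on the new multi-term flow above: merge_info() keeps the first occurrence of each title, so searching a second term never duplicates entries already collected. A minimal sketch of that behaviour with made-up records, assuming merge_info from the script above is in scope (real items come from scraps.dblpSearcher and carry additional keys such as 'venue' and 'ee'):

```python
# Illustrative records only, not real dblp output.
first = [
    {'title': 'Paper A', 'year': '2021'},
    {'title': 'Paper B', 'year': '2020'},
]
second = [
    {'title': 'Paper B', 'year': '2020'},   # duplicate title, dropped on merge
    {'title': 'Paper C', 'year': '2019'},
]

merged = merge_info(first, second)
print([item['title'] for item in merged])   # ['Paper A', 'Paper B', 'Paper C']
```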

diff --git a/html_builder/__init__.py b/htmlBuilder/__init__.py
similarity index 100%
rename from html_builder/__init__.py
rename to htmlBuilder/__init__.py
diff --git a/htmlBuilder/resources/entry_temp.html b/htmlBuilder/resources/entry_temp.html
new file mode 100644
index 0000000..0da0901
--- /dev/null
+++ b/htmlBuilder/resources/entry_temp.html
@@ -0,0 +1,20 @@
+        link
+        Error: Embedded data could not be displayed.
\ No newline at end of file
diff --git a/htmlBuilder/resources/log_temp.html b/htmlBuilder/resources/log_temp.html
new file mode 100644
index 0000000..1566be7
--- /dev/null
+++ b/htmlBuilder/resources/log_temp.html
@@ -0,0 +1,16 @@
+
\ No newline at end of file
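Only the visible text of the two new templates ("link" and the embed fallback message) is shown above. Judging from how richBuilder.py (the next file) walks the trees, the markup must look roughly like the BeautifulSoup literals below: an outer <a> wrapping a <p> with three <span>s plus a link anchor, and a <div> holding an <object>/<embed> preview. The committed files very likely carry extra classes and attributes (for example for the data-target collapse toggle), so this is a guess at the minimal structure, not the real markup:

```python
from bs4 import BeautifulSoup

# Guessed skeleton of entry_temp.html: richBuilder takes the first <a> of the
# file, fills the three <span>s with year/venue/title, points the "link"
# anchor's href at the paper, and sets the <object>/<embed> preview source.
entry_guess = BeautifulSoup(
    '<a data-target="">'
    '<p><span></span> <span></span> <span></span> <a href="">link</a></p>'
    '<div id="">'
    '<object data=""><embed src="">'
    'Error: Embedded data could not be displayed.'
    '</object>'
    '</div>'
    '</a>',
    features="html.parser",
)

# Guessed skeleton of log_temp.html: a page with a <title>, an <h1>, an <h3>
# and a <div> that collects the generated entries.
log_guess = BeautifulSoup(
    '<html><head><title></title></head>'
    '<body><h1></h1><h3></h3><div></div></body></html>',
    features="html.parser",
)

# The lookups performed by richBuilder.py all resolve on these skeletons.
entry = entry_guess.find('a')
assert entry.p.a is not None
assert entry.div.object.embed is not None
assert log_guess.html.body.div is not None
```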
diff --git a/htmlBuilder/richBuilder.py b/htmlBuilder/richBuilder.py
new file mode 100644
index 0000000..9db828d
--- /dev/null
+++ b/htmlBuilder/richBuilder.py
@@ -0,0 +1,63 @@
+from os.path import dirname, realpath, join
+import config
+from utils.path import mkdir
+from copy import copy
+from bs4 import BeautifulSoup
+from utils.logging import *
+
+
+# logging path
+_logging_path = join(config.root, "output")
+mkdir(_logging_path)
+
+# resource path
+_dir_path_of_this_file = dirname(realpath(__file__))
+_resource_path = join(_dir_path_of_this_file, 'resources')
+
+# load templates
+with open(join(_resource_path, 'entry_temp.html')) as _f:
+    _entry_temp = BeautifulSoup(_f, features="html.parser").find('a')
+
+# make soup with temp
+with open(join(_resource_path, 'log_temp.html')) as f:
+    html_soup = BeautifulSoup(f, features="html.parser")
+
+
+def get_logging_file_path(filename: str) -> str:
+    path = join(_logging_path, filename)
+    return path
+
+
+def _write_entry(title: list, paper_link: str, id_: int):
+    # create entry from temp
+    entry = copy(_entry_temp)
+    # sign entry
+    entry['data-target'] = '#{}'.format(id_)
+    entry.div['id'] = '{}'.format(id_)
+    entry.p.a['href'] = paper_link
+    # embed title
+    cnt = 0
+    for index, item in enumerate(entry.p.children):
+        if item.name == "span":
+            item.string = str(title[cnt])
+            cnt += 1
+    # embed sub-page
+    entry.div.object['data'] = paper_link
+    entry.div.object.embed['src'] = paper_link
+    # entry.div.iframe['src'] = paper_link
+    # add to soup
+    html_soup.html.body.div.append(entry)
+
+
+def save_as_html(info, topic):
+    saving_path = get_logging_file_path("PaperHub_Searching_Result__{}.html".format(topic))
+    # body
+    for index, item in enumerate(info):
+        _write_entry([item['year'], item['venue'], item['title']], item['ee'], index)
+    # header
+    html_soup.html.head.title.string = "PaperHub: {}".format(topic)
+    html_soup.html.body.h1.string = topic
+    html_soup.html.body.h3.string = '{} results'.format(str(len(info)))
+    with open(saving_path, 'w', encoding='utf-8') as f_:
+        f_.write(str(html_soup))
+    log('{}{} results have been successfully saved to {}'.format(STD_INFO, str(len(info)), saving_path))
diff --git a/html_builder/__pycache__/__init__.cpython-37.pyc b/html_builder/__pycache__/__init__.cpython-37.pyc
deleted file mode 100644
index 23131f3..0000000
Binary files a/html_builder/__pycache__/__init__.cpython-37.pyc and /dev/null differ
diff --git a/requirements.txt b/requirements.txt
index a7b439c..fabd88d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,5 @@ scidownl~=0.2.7
 termcolor~=1.1.0
 colorama~=0.4.4
 requests~=2.24.0
-urllib3~=1.25.11
\ No newline at end of file
+urllib3~=1.25.11
+beautifulsoup4
diff --git a/scraps/__pycache__/__init__.cpython-37.pyc b/scraps/__pycache__/__init__.cpython-37.pyc
deleted file mode 100644
index 35b181f..0000000
Binary files a/scraps/__pycache__/__init__.cpython-37.pyc and /dev/null differ
diff --git a/scraps/__pycache__/dblpSearcher.cpython-37.pyc b/scraps/__pycache__/dblpSearcher.cpython-37.pyc
deleted file mode 100644
index 15d7158..0000000
Binary files a/scraps/__pycache__/dblpSearcher.cpython-37.pyc and /dev/null differ
diff --git a/scraps/dblpSearcher.py b/scraps/dblpSearcher.py
index ed9add8..5543a66 100644
--- a/scraps/dblpSearcher.py
+++ b/scraps/dblpSearcher.py
@@ -37,6 +37,18 @@ def get_xml(terms, number, batch_size=100):
         xml += [fetch(url)]
     return xml
 
+# def get_xml(terms, number):
+#     """
+#     :param terms: string of searched terms
+#     :param number: number of results
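Stepping back to htmlBuilder/richBuilder.py above: its save_as_html() is not wired into dblpSearch.py by this change. If it were, the call sequence would presumably look like the sketch below (the query string and result count are placeholders, and it assumes every result dict carries the 'year', 'venue', 'title' and 'ee' keys that _write_entry() reads):

```python
import scraps.dblpSearcher as db
from htmlBuilder import richBuilder

# Hypothetical query and result count, just to show the call sequence.
info = db.search('federated learning', 20)
info = db.sort_by_year(info)

# Expected to write PaperHub_Searching_Result__federated learning.html into
# the repo's output/ directory and log a success message via utils.logging.
richBuilder.save_as_html(info, 'federated learning')
```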
+#     :param batch_size: number of results extracted from dblp each time
+#     :return: a list of xml strings
+#     """
+#     xml = []
+#     url = 'https://dblp.org/search/publ/api?q=' + str(terms) + '&h=' + str(number)
+#     xml += [fetch(url)]
+#     return xml
+
 
 def get_attribute(info):
     """
diff --git a/settings.json b/settings.json
index 85ac480..8f68849 100644
--- a/settings.json
+++ b/settings.json
@@ -1,6 +1,6 @@
 {
   "publishers": [
-    "sigcomm", "mobicom", "nsdi", "sensys", "mobisys", "imwut" , "ipsn", "infocom", "mobihoc"
+    "sigcomm", "mobicom", "nsdi", "sensys", "mobisys", "imwut" , "ubicomp", "ipsn", "infocom", "mobihoc", "sysml", "hotedgevideo", "MM", "VR"
   ],
   "number_per_search": 100,
   "number_per_publisher": 100,
diff --git a/utils/__init__.py b/utils/__init__.py
index 09dfeb5..f288e13 100644
--- a/utils/__init__.py
+++ b/utils/__init__.py
@@ -1,2 +1,2 @@
 import colorama
-colorama.init()
\ No newline at end of file
+colorama.init()
diff --git a/utils/__pycache__/__init__.cpython-37.pyc b/utils/__pycache__/__init__.cpython-37.pyc
deleted file mode 100644
index 6c3d58e..0000000
Binary files a/utils/__pycache__/__init__.cpython-37.pyc and /dev/null differ
diff --git a/utils/__pycache__/logging.cpython-37.pyc b/utils/__pycache__/logging.cpython-37.pyc
deleted file mode 100644
index 37d9f6f..0000000
Binary files a/utils/__pycache__/logging.cpython-37.pyc and /dev/null differ
diff --git a/utils/__pycache__/path.cpython-37.pyc b/utils/__pycache__/path.cpython-37.pyc
deleted file mode 100644
index 4c2d191..0000000
Binary files a/utils/__pycache__/path.cpython-37.pyc and /dev/null differ
diff --git a/utils/parallel.py b/utils/parallel.py
new file mode 100644
index 0000000..35770af
--- /dev/null
+++ b/utils/parallel.py
@@ -0,0 +1,19 @@
+import threading
+
+
+# https://stackoverflow.com/questions/38978652/how-to-protect-an-object-using-a-lock-in-python
+class HidingLock(object):
+    def __init__(self, obj, lock=None):
+        self.lock = lock or threading.RLock()
+        self._obj = obj
+
+    def __enter__(self):
+        self.lock.acquire()
+        return self._obj
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.lock.release()
+
+    def set(self, obj):
+        with self:
+            self._obj = obj
\ No newline at end of file
diff --git a/utils/path.py b/utils/path.py
index da6e3a6..f23d11e 100644
--- a/utils/path.py
+++ b/utils/path.py
@@ -1,6 +1,8 @@
 import os.path as osp
 import os
 from pathlib import Path
+import unicodedata
+import re
 
 
 def mkdir(path):
@@ -11,3 +13,20 @@
 
 def path_parent(path):
     return Path(path).parent
+
+def slugify(value, allow_unicode=False):
+    """
+    Taken from https://github.com/django/django/blob/master/django/utils/text.py
+    Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
+    dashes to single dashes. Remove characters that aren't alphanumerics,
+    underscores, or hyphens. Convert to lowercase. Also strip leading and
+    trailing whitespace, dashes, and underscores.
+    """
+    value = str(value)
+    if allow_unicode:
+        value = unicodedata.normalize('NFKC', value)
+    else:
+        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
+    value = re.sub(r'[^\w\s-]', '', value.lower())
+    return re.sub(r'[-\s]+', '-', value).strip('-_')
+
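For reference, dblpSearch.py now derives the output filename by passing the whole terms list through slugify(), which stringifies the list before cleaning it up. A short illustration with made-up search terms:

```python
from utils.path import slugify

# dblpSearch.py passes the full list of search terms, so slugify() operates on
# its str() representation; brackets, quotes and commas are stripped and the
# remaining whitespace collapses to single dashes.
terms = ['mmWave sensing', 'edge offloading']   # example input only
print(slugify(terms))
# -> mmwave-sensing-edge-offloading
# get_saving_path() would then place the results in
# output/mmwave-sensing-edge-offloading.html next to the script.
```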