Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
WaterHyacinthInNANHU committed Sep 15, 2022
1 parent ed48a0f commit 3ee5423
Show file tree
Hide file tree
Showing 19 changed files with 209 additions and 17 deletions.
1 change: 1 addition & 0 deletions config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
with open(join(root, 'settings.json')) as f:
settings = json.load(f)


Binary file removed config/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
69 changes: 55 additions & 14 deletions dblpSearch.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,76 @@
import scraps.dblpSearcher as db
import html_builder as hbuild
import htmlBuilder as hb
import os
import webbrowser
from config import settings
import copy
from utils.path import slugify

# load parameters
PUBLISHERS = settings['publishers']
NUMBER_PER_PUBLISHER = int(settings['number_per_publisher'])
NUMBER_PER_SEARCH = int(settings['number_per_search'])

# init output file
SAVE_PATH = 'output/dblp_output.html'
SAVE_FULL_PATH = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), SAVE_PATH)
SAVE_FULL_PATH = SAVE_FULL_PATH.replace('\\', '/')
print(SAVE_FULL_PATH)
target = "file:///{}".format(SAVE_FULL_PATH)


def get_saving_path(name):
    """Return the absolute, forward-slash path of the HTML output file for *name*.

    The file is placed in the ``output`` directory next to this script;
    backslashes are normalized so the path works in ``file://`` URLs on Windows.
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))
    relative = 'output/{}.html'.format(name)
    return '{}/{}'.format(base_dir, relative).replace('\\', '/')

# print(SAVE_FULL_PATH)
# target = "file:///{}".format(SAVE_FULL_PATH)


def search(terms):
    """Interactively run a dblp query for *terms*.

    Prompts until the user answers 'y' (search the configured publisher list)
    or 'n' (plain search); any other answer re-prompts. Returns the result
    list produced by the chosen ``db`` call.
    """
    while True:
        answer = input("Search by publishers? [y/n]:")
        if answer == 'y':
            return db.search_by_conference(terms, PUBLISHERS, NUMBER_PER_PUBLISHER)
        if answer == 'n':
            return db.search(terms, NUMBER_PER_SEARCH)


def merge_info(info1, info2):
    """Return a deep copy of *info1* extended with unseen items of *info2*.

    Items are dicts; two items are considered duplicates when their 'title'
    values are equal. Order is preserved: all of *info1* first, then the new
    items of *info2* in their original order. Neither input is mutated.
    """
    merged = copy.deepcopy(info1)
    seen_titles = {entry['title'] for entry in info1}
    for entry in info2:
        if entry['title'] not in seen_titles:
            merged.append(entry)
            seen_titles.add(entry['title'])
    return merged


# search
# NOTE(review): this region is diff residue — removed (pre-change) and added
# (post-change) lines of the commit are interleaved without +/- markers, so it
# is NOT valid Python as scraped. Pairs such as `terms = input(...)` vs
# `terms = [input(...)]`, and the two `mode = input(...)` prompts, are the
# old/new versions of the same statement. The post-change flow appears to be:
# collect search terms one at a time, merge deduplicated results via
# merge_info, sort by year, save to an HTML file named after the slugified
# terms, then open it in a browser — reconstruct from the repository, not here.
while True:
terms = input("Search for: ")
terms = [input("Search for: ")]
info = search(terms[-1])
while True:
mode = input("Search by publishers? [y/n]:")
mode = input("Continue searching for other terms? [y/n]:")
if mode == 'y':
info = db.search_by_conference(terms, PUBLISHERS, NUMBER_PER_PUBLISHER)
break
terms.append(input("Search for: "))
info = merge_info(info, search(terms[-1]))
elif mode == 'n':
info = db.search(terms, NUMBER_PER_SEARCH)
break
else:
continue

info = db.sort_by_year(info)
hbuild.save_as_html(info, SAVE_PATH, heading=terms)
webbrowser.open('file://' + SAVE_FULL_PATH)
save_path = get_saving_path(slugify(terms))
hb.save_as_html(info, save_path, heading=str(terms))
webbrowser.open('file://' + save_path)

File renamed without changes.
20 changes: 20 additions & 0 deletions htmlBuilder/resources/entry_temp.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<!-- Template for ONE collapsible search-result entry. richBuilder parses this
     file and takes the <a> element; _write_entry then clones it per result,
     rewrites data-target / the div id / the inner link href, fills the three
     <span> slots (year, venue, title) in order, and points the <object>/<embed>
     preview at the paper URL. Keep the element structure stable — the code
     navigates it positionally (entry.p, entry.div.object.embed, ...). -->
<a href="javascript:void(0);" class="list-group-item" data-toggle="collapse" data-target="#1">
<p>
<span style="color:#ffc907;"></span>
<span style="color:#2e9df7;"></span>
<span style="color:#231f20;"></span>
<a href="">link</a>
</p>
<div id="1" class="collapse">
<object data="http://www.web-source.net" width="600" height="400">
<embed src="http://www.web-source.net" width="600" height="400">
Error: Embedded data could not be displayed.
</object>
<!-- <iframe id="inlineFrameExample"-->
<!-- title="Inline Frame Example"-->
<!-- width="300"-->
<!-- height="200"-->
<!-- src="http://www.web-source.net">-->
<!-- </iframe>-->
</div>
</a>
16 changes: 16 additions & 0 deletions htmlBuilder/resources/log_temp.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<!DOCTYPE html>
<!-- Page skeleton for the saved results log. richBuilder parses this file into
     html_soup, fills <title>, <h1>, <h3>, and appends one entry per result
     into the .list-group <div>. Bootstrap/jQuery are loaded from CDNs for the
     collapse behaviour used by the entries. -->
<html>
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
<title></title>
</head>
<body>
<h1></h1>
<h3></h3>
<div class="list-group">
</div>
</body>
</html>
63 changes: 63 additions & 0 deletions htmlBuilder/richBuilder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from os.path import dirname, realpath, join
import config
from utils.path import mkdir
from copy import copy
from bs4 import BeautifulSoup
from utils.logging import *


# logging path
# Output files go under <project root>/output; create it up front so later
# writes cannot fail on a missing directory.
_logging_path = join(config.root, "output")
mkdir(_logging_path)

# resource path
# HTML templates ship next to this module in ./resources.
_dir_path_of_this_file = dirname(realpath(__file__))
_resource_path = join(_dir_path_of_this_file, 'resources')

# load templates
# _entry_temp is the parsed <a> element of one result entry; _write_entry
# clones it per result instead of re-parsing the file each time.
with open(join(_resource_path, 'entry_temp.html')) as _f:
    _entry_temp = BeautifulSoup(_f, features="html.parser").find('a')

# make soup with temp
# html_soup is the page skeleton that entries are appended into.
# NOTE(review): module-level mutable state shared by _write_entry/save_as_html.
with open(join(_resource_path, 'log_temp.html')) as f:
    html_soup = BeautifulSoup(f, features="html.parser")


def get_logging_file_path(filename: str) -> str:
    """Return the absolute path for *filename* inside the output directory."""
    return join(_logging_path, filename)


def _write_entry(title: list, paper_link: str, id_: int):
    """Append one collapsible result entry to the module-level html_soup.

    :param title: values for the three styled <span> slots, in template order
                  (callers pass [year, venue, title])
    :param paper_link: paper URL, used both for the visible link and the
                       embedded <object>/<embed> preview
    :param id_: unique integer wiring the toggle anchor to its collapse <div>
    """
    # create entry from temp — clone so the shared template stays pristine
    entry = copy(_entry_temp)
    # sign entry: tie the bootstrap collapse toggle to this entry's div
    entry['data-target'] = '#{}'.format(id_)
    entry.div['id'] = '{}'.format(id_)
    # BUGFIX: was entry.p.a['herf'] — the typo created a bogus `herf`
    # attribute and left the visible "link" anchor's href empty.
    entry.p.a['href'] = paper_link
    # embed title: fill the <span> slots in order, skipping non-span children
    cnt = 0
    for index, item in enumerate(entry.p.children):
        if item.name == "span":
            item.string = str(title[cnt])
            cnt += 1
    # embed sub-page: point the inline preview at the paper page
    entry.div.object['data'] = paper_link
    entry.div.object.embed['src'] = paper_link
    # entry.div.iframe['src'] = paper_link
    # add to soup (mutates the shared module-level page)
    html_soup.html.body.div.append(entry)


def save_as_html(info, topic):
    """Render *info* search results into a standalone HTML page and save it.

    :param info: list of result dicts; each must provide 'year', 'venue',
                 'title' and 'ee' (paper link) keys
    :param topic: search topic, used in the output file name and page headings
    """
    saving_path = get_logging_file_path("PaperHub_Searching_Result__{}.html".format(topic))
    # body
    # NOTE(review): _write_entry appends into the module-level html_soup, so
    # calling save_as_html more than once per process accumulates entries from
    # previous calls — confirm this is only ever invoked once per run.
    for index, item in enumerate(info):
        _write_entry([item['year'], item['venue'], item['title']], item['ee'], index)
    # header
    html_soup.html.head.title.string = "PaperHub: {}".format(topic)
    html_soup.html.body.h1.string = topic
    html_soup.html.body.h3.string = '{} results'.format(str(len(info)))
    with open(saving_path, 'w', encoding='utf-8') as f_:
        f_.write(str(html_soup))
    log('{}{} results have been successfully saved to {}'.format(STD_INFO, str(len(info)), saving_path))
Binary file removed html_builder/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ scidownl~=0.2.7
termcolor~=1.1.0
colorama~=0.4.4
requests~=2.24.0
urllib3~=1.25.11
urllib3~=1.25.11
beautifulsoup4
Binary file removed scraps/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file removed scraps/__pycache__/dblpSearcher.cpython-37.pyc
Binary file not shown.
12 changes: 12 additions & 0 deletions scraps/dblpSearcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,18 @@ def get_xml(terms, number, batch_size=100):
xml += [fetch(url)]
return xml

# def get_xml(terms, number):
# """
# :param terms: string of searched terms
# :param number: number of results
# :param batch_size: number of results extracted from dblp each time
# :return: a list of xml strings
# """
# xml = []
# url = 'https://dblp.org/search/publ/api?q=' + str(terms) + '&h=' + str(number)
# xml += [fetch(url)]
# return xml


def get_attribute(info):
"""
Expand Down
2 changes: 1 addition & 1 deletion settings.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"publishers": [
"sigcomm", "mobicom", "nsdi", "sensys", "mobisys", "imwut" , "ipsn", "infocom", "mobihoc"
"sigcomm", "mobicom", "nsdi", "sensys", "mobisys", "imwut" , "ubicomp", "ipsn", "infocom", "mobihoc", "sysml", "hotedgevideo", "MM", "VR"
],
"number_per_search": 100,
"number_per_publisher": 100,
Expand Down
2 changes: 1 addition & 1 deletion utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
import colorama
colorama.init()
colorama.init()
Binary file removed utils/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file removed utils/__pycache__/logging.cpython-37.pyc
Binary file not shown.
Binary file removed utils/__pycache__/path.cpython-37.pyc
Binary file not shown.
19 changes: 19 additions & 0 deletions utils/parallel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import threading


# https://stackoverflow.com/questions/38978652/how-to-protect-an-object-using-a-lock-in-python
# https://stackoverflow.com/questions/38978652/how-to-protect-an-object-using-a-lock-in-python
class HidingLock(object):
    """Guard an object behind a lock: the wrapped value is only handed out
    inside a ``with`` block, while the lock is held."""

    def __init__(self, obj, lock=None):
        # Reentrant by default so the owning thread may nest `with` blocks;
        # a caller-supplied lock takes precedence.
        self.lock = lock or threading.RLock()
        self._obj = obj

    def __enter__(self):
        # Acquire, then expose the protected object for the block's duration.
        self.lock.acquire()
        return self._obj

    def __exit__(self, exc_type, exc_value, traceback):
        self.lock.release()

    def set(self, obj):
        """Atomically replace the protected object."""
        with self.lock:
            self._obj = obj
19 changes: 19 additions & 0 deletions utils/path.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os.path as osp
import os
from pathlib import Path
import unicodedata
import re


def mkdir(path):
Expand All @@ -11,3 +13,20 @@ def mkdir(path):
def path_parent(path):
return Path(path).parent


def slugify(value, allow_unicode=False):
    """Normalize *value* into a filesystem/URL-friendly slug.

    Adapted from https://github.com/django/django/blob/master/django/utils/text.py
    Lowercases, drops characters other than alphanumerics, underscores,
    hyphens and whitespace, collapses whitespace/hyphen runs to single
    hyphens, and strips leading/trailing hyphens and underscores. Non-ASCII
    input is transliterated to ASCII unless *allow_unicode* is true.
    """
    text = str(value)
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    cleaned = re.sub(r'[^\w\s-]', '', text.lower())
    collapsed = re.sub(r'[-\s]+', '-', cleaned)
    return collapsed.strip('-_')

0 comments on commit 3ee5423

Please sign in to comment.