Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
WaterHyacinthInNANHU committed Sep 15, 2022
1 parent ed48a0f commit 3ee5423
Show file tree
Hide file tree
Showing 19 changed files with 209 additions and 17 deletions.
1 change: 1 addition & 0 deletions config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
with open(join(root, 'settings.json')) as f:
settings = json.load(f)


Binary file removed config/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
69 changes: 55 additions & 14 deletions dblpSearch.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,76 @@
import scraps.dblpSearcher as db
import html_builder as hbuild
import htmlBuilder as hb
import os
import webbrowser
from config import settings
import copy
from utils.path import slugify

# load parameters
PUBLISHERS = settings['publishers']
NUMBER_PER_PUBLISHER = int(settings['number_per_publisher'])
NUMBER_PER_SEARCH = int(settings['number_per_search'])

# init output file
SAVE_PATH = 'output/dblp_output.html'
SAVE_FULL_PATH = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), SAVE_PATH)
SAVE_FULL_PATH = SAVE_FULL_PATH.replace('\\', '/')
print(SAVE_FULL_PATH)
target = "file:///{}".format(SAVE_FULL_PATH)


def get_saving_path(name):
    """Return the absolute, forward-slash path of the HTML output file for *name*.

    The file is placed in the ``output`` directory next to this script;
    backslashes are normalized so the path works in ``file://`` URLs on Windows.
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))
    relative = 'output/{}.html'.format(name)
    return '{}/{}'.format(base_dir, relative).replace('\\', '/')

# print(SAVE_FULL_PATH)
# target = "file:///{}".format(SAVE_FULL_PATH)


def search(terms):
    """Interactively run a dblp query for *terms*.

    Prompts until the user answers 'y' (search the configured publisher list)
    or 'n' (plain search); any other answer re-prompts. Returns the result
    list produced by the chosen ``db`` call.
    """
    while True:
        answer = input("Search by publishers? [y/n]:")
        if answer == 'y':
            return db.search_by_conference(terms, PUBLISHERS, NUMBER_PER_PUBLISHER)
        if answer == 'n':
            return db.search(terms, NUMBER_PER_SEARCH)


def merge_info(info1, info2):
    """Return a deep copy of *info1* extended with unseen items of *info2*.

    Items are dicts; two items are considered duplicates when their 'title'
    values are equal. Order is preserved: all of *info1* first, then the new
    items of *info2* in their original order. Neither input is mutated.
    """
    merged = copy.deepcopy(info1)
    seen_titles = {entry['title'] for entry in info1}
    for entry in info2:
        if entry['title'] not in seen_titles:
            merged.append(entry)
            seen_titles.add(entry['title'])
    return merged


# search
# NOTE(review): this region is diff residue — removed (pre-change) and added
# (post-change) lines of the commit are interleaved without +/- markers, so it
# is NOT valid Python as scraped. Pairs such as `terms = input(...)` vs
# `terms = [input(...)]`, and the two `mode = input(...)` prompts, are the
# old/new versions of the same statement. The post-change flow appears to be:
# collect search terms one at a time, merge deduplicated results via
# merge_info, sort by year, save to an HTML file named after the slugified
# terms, then open it in a browser — reconstruct from the repository, not here.
while True:
terms = input("Search for: ")
terms = [input("Search for: ")]
info = search(terms[-1])
while True:
mode = input("Search by publishers? [y/n]:")
mode = input("Continue searching for other terms? [y/n]:")
if mode == 'y':
info = db.search_by_conference(terms, PUBLISHERS, NUMBER_PER_PUBLISHER)
break
terms.append(input("Search for: "))
info = merge_info(info, search(terms[-1]))
elif mode == 'n':
info = db.search(terms, NUMBER_PER_SEARCH)
break
else:
continue

info = db.sort_by_year(info)
hbuild.save_as_html(info, SAVE_PATH, heading=terms)
webbrowser.open('file://' + SAVE_FULL_PATH)
save_path = get_saving_path(slugify(terms))
hb.save_as_html(info, save_path, heading=str(terms))
webbrowser.open('file://' + save_path)

File renamed without changes.
20 changes: 20 additions & 0 deletions htmlBuilder/resources/entry_temp.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<!-- Template for ONE collapsible search-result entry. richBuilder parses this
     file and takes the <a> element; _write_entry then clones it per result,
     rewrites data-target / the div id / the inner link href, fills the three
     <span> slots (year, venue, title) in order, and points the <object>/<embed>
     preview at the paper URL. Keep the element structure stable — the code
     navigates it positionally (entry.p, entry.div.object.embed, ...). -->
<a href="javascript:void(0);" class="list-group-item" data-toggle="collapse" data-target="#1">
<p>
<span style="color:#ffc907;"></span>
<span style="color:#2e9df7;"></span>
<span style="color:#231f20;"></span>
<a href="">link</a>
</p>
<div id="1" class="collapse">
<object data="http://www.web-source.net" width="600" height="400">
<embed src="http://www.web-source.net" width="600" height="400">
Error: Embedded data could not be displayed.
</object>
<!-- <iframe id="inlineFrameExample"-->
<!-- title="Inline Frame Example"-->
<!-- width="300"-->
<!-- height="200"-->
<!-- src="http://www.web-source.net">-->
<!-- </iframe>-->
</div>
</a>
16 changes: 16 additions & 0 deletions htmlBuilder/resources/log_temp.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<!DOCTYPE html>
<!-- Page skeleton for the saved results log. richBuilder parses this file into
     html_soup, fills <title>, <h1>, <h3>, and appends one entry per result
     into the .list-group <div>. Bootstrap/jQuery are loaded from CDNs for the
     collapse behaviour used by the entries. -->
<html>
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
<title></title>
</head>
<body>
<h1></h1>
<h3></h3>
<div class="list-group">
</div>
</body>
</html>
63 changes: 63 additions & 0 deletions htmlBuilder/richBuilder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from os.path import dirname, realpath, join
import config
from utils.path import mkdir
from copy import copy
from bs4 import BeautifulSoup
from utils.logging import *


# logging path
# Output files go under <project root>/output; create it up front so later
# writes cannot fail on a missing directory.
_logging_path = join(config.root, "output")
mkdir(_logging_path)

# resource path
# HTML templates ship next to this module in ./resources.
_dir_path_of_this_file = dirname(realpath(__file__))
_resource_path = join(_dir_path_of_this_file, 'resources')

# load templates
# _entry_temp is the parsed <a> element of one result entry; _write_entry
# clones it per result instead of re-parsing the file each time.
with open(join(_resource_path, 'entry_temp.html')) as _f:
    _entry_temp = BeautifulSoup(_f, features="html.parser").find('a')

# make soup with temp
# html_soup is the page skeleton that entries are appended into.
# NOTE(review): module-level mutable state shared by _write_entry/save_as_html.
with open(join(_resource_path, 'log_temp.html')) as f:
    html_soup = BeautifulSoup(f, features="html.parser")


def get_logging_file_path(filename: str) -> str:
    """Return the absolute path for *filename* inside the output directory."""
    return join(_logging_path, filename)


def _write_entry(title: list, paper_link: str, id_: int):
    """Append one collapsible result entry to the module-level html_soup.

    :param title: values for the three styled <span> slots, in template order
                  (callers pass [year, venue, title])
    :param paper_link: paper URL, used both for the visible link and the
                       embedded <object>/<embed> preview
    :param id_: unique integer wiring the toggle anchor to its collapse <div>
    """
    # create entry from temp — clone so the shared template stays pristine
    entry = copy(_entry_temp)
    # sign entry: tie the bootstrap collapse toggle to this entry's div
    entry['data-target'] = '#{}'.format(id_)
    entry.div['id'] = '{}'.format(id_)
    # BUGFIX: was entry.p.a['herf'] — the typo created a bogus `herf`
    # attribute and left the visible "link" anchor's href empty.
    entry.p.a['href'] = paper_link
    # embed title: fill the <span> slots in order, skipping non-span children
    cnt = 0
    for index, item in enumerate(entry.p.children):
        if item.name == "span":
            item.string = str(title[cnt])
            cnt += 1
    # embed sub-page: point the inline preview at the paper page
    entry.div.object['data'] = paper_link
    entry.div.object.embed['src'] = paper_link
    # entry.div.iframe['src'] = paper_link
    # add to soup (mutates the shared module-level page)
    html_soup.html.body.div.append(entry)


def save_as_html(info, topic):
    """Render *info* search results into a standalone HTML page and save it.

    :param info: list of result dicts; each must provide 'year', 'venue',
                 'title' and 'ee' (paper link) keys
    :param topic: search topic, used in the output file name and page headings
    """
    saving_path = get_logging_file_path("PaperHub_Searching_Result__{}.html".format(topic))
    # body
    # NOTE(review): _write_entry appends into the module-level html_soup, so
    # calling save_as_html more than once per process accumulates entries from
    # previous calls — confirm this is only ever invoked once per run.
    for index, item in enumerate(info):
        _write_entry([item['year'], item['venue'], item['title']], item['ee'], index)
    # header
    html_soup.html.head.title.string = "PaperHub: {}".format(topic)
    html_soup.html.body.h1.string = topic
    html_soup.html.body.h3.string = '{} results'.format(str(len(info)))
    with open(saving_path, 'w', encoding='utf-8') as f_:
        f_.write(str(html_soup))
    log('{}{} results have been successfully saved to {}'.format(STD_INFO, str(len(info)), saving_path))
Binary file removed html_builder/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ scidownl~=0.2.7
termcolor~=1.1.0
colorama~=0.4.4
requests~=2.24.0
urllib3~=1.25.11
urllib3~=1.25.11
beautifulsoup4
Binary file removed scraps/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file removed scraps/__pycache__/dblpSearcher.cpython-37.pyc
Binary file not shown.
12 changes: 12 additions & 0 deletions scraps/dblpSearcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,18 @@ def get_xml(terms, number, batch_size=100):
xml += [fetch(url)]
return xml

# def get_xml(terms, number):
# """
# :param terms: string of searched terms
# :param number: number of results
# :param batch_size: number of results extracted from dblp each time
# :return: a list of xml strings
# """
# xml = []
# url = 'https://dblp.org/search/publ/api?q=' + str(terms) + '&h=' + str(number)
# xml += [fetch(url)]
# return xml


def get_attribute(info):
"""
Expand Down
2 changes: 1 addition & 1 deletion settings.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"publishers": [
"sigcomm", "mobicom", "nsdi", "sensys", "mobisys", "imwut" , "ipsn", "infocom", "mobihoc"
"sigcomm", "mobicom", "nsdi", "sensys", "mobisys", "imwut" , "ubicomp", "ipsn", "infocom", "mobihoc", "sysml", "hotedgevideo", "MM", "VR"
],
"number_per_search": 100,
"number_per_publisher": 100,
Expand Down
2 changes: 1 addition & 1 deletion utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
import colorama
colorama.init()
colorama.init()
Binary file removed utils/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file removed utils/__pycache__/logging.cpython-37.pyc
Binary file not shown.
Binary file removed utils/__pycache__/path.cpython-37.pyc
Binary file not shown.
19 changes: 19 additions & 0 deletions utils/parallel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import threading


# https://stackoverflow.com/questions/38978652/how-to-protect-an-object-using-a-lock-in-python
# https://stackoverflow.com/questions/38978652/how-to-protect-an-object-using-a-lock-in-python
class HidingLock(object):
    """Guard an object behind a lock: the wrapped value is only handed out
    inside a ``with`` block, while the lock is held."""

    def __init__(self, obj, lock=None):
        # Reentrant by default so the owning thread may nest `with` blocks;
        # a caller-supplied lock takes precedence.
        self.lock = lock or threading.RLock()
        self._obj = obj

    def __enter__(self):
        # Acquire, then expose the protected object for the block's duration.
        self.lock.acquire()
        return self._obj

    def __exit__(self, exc_type, exc_value, traceback):
        self.lock.release()

    def set(self, obj):
        """Atomically replace the protected object."""
        with self.lock:
            self._obj = obj
19 changes: 19 additions & 0 deletions utils/path.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os.path as osp
import os
from pathlib import Path
import unicodedata
import re


def mkdir(path):
Expand All @@ -11,3 +13,20 @@ def mkdir(path):
def path_parent(path):
return Path(path).parent


def slugify(value, allow_unicode=False):
    """Normalize *value* into a filesystem/URL-friendly slug.

    Adapted from https://github.com/django/django/blob/master/django/utils/text.py
    Lowercases, drops characters other than alphanumerics, underscores,
    hyphens and whitespace, collapses whitespace/hyphen runs to single
    hyphens, and strips leading/trailing hyphens and underscores. Non-ASCII
    input is transliterated to ASCII unless *allow_unicode* is true.
    """
    text = str(value)
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    cleaned = re.sub(r'[^\w\s-]', '', text.lower())
    collapsed = re.sub(r'[-\s]+', '-', cleaned)
    return collapsed.strip('-_')

0 comments on commit 3ee5423

Please sign in to comment.