From 3ee5423e7eba18cc577e35e0e4a7501b6a89ac91 Mon Sep 17 00:00:00 2001
From: Yan <2641574067@qq.com>
Date: Thu, 15 Sep 2022 19:57:13 +0800
Subject: [PATCH] update
---
config/__init__.py | 1 +
config/__pycache__/__init__.cpython-37.pyc | Bin 365 -> 0 bytes
dblpSearch.py | 69 ++++++++++++++----
{html_builder => htmlBuilder}/__init__.py | 0
htmlBuilder/resources/entry_temp.html | 20 +++++
htmlBuilder/resources/log_temp.html | 16 ++++
htmlBuilder/richBuilder.py | 63 ++++++++++++++++
.../__pycache__/__init__.cpython-37.pyc | Bin 1627 -> 0 bytes
requirements.txt | 3 +-
scraps/__pycache__/__init__.cpython-37.pyc | Bin 163 -> 0 bytes
.../__pycache__/dblpSearcher.cpython-37.pyc | Bin 3930 -> 0 bytes
scraps/dblpSearcher.py | 12 +++
settings.json | 2 +-
utils/__init__.py | 2 +-
utils/__pycache__/__init__.cpython-37.pyc | Bin 198 -> 0 bytes
utils/__pycache__/logging.cpython-37.pyc | Bin 928 -> 0 bytes
utils/__pycache__/path.cpython-37.pyc | Bin 501 -> 0 bytes
utils/parallel.py | 19 +++++
utils/path.py | 19 +++++
19 files changed, 209 insertions(+), 17 deletions(-)
delete mode 100644 config/__pycache__/__init__.cpython-37.pyc
rename {html_builder => htmlBuilder}/__init__.py (100%)
create mode 100644 htmlBuilder/resources/entry_temp.html
create mode 100644 htmlBuilder/resources/log_temp.html
create mode 100644 htmlBuilder/richBuilder.py
delete mode 100644 html_builder/__pycache__/__init__.cpython-37.pyc
delete mode 100644 scraps/__pycache__/__init__.cpython-37.pyc
delete mode 100644 scraps/__pycache__/dblpSearcher.cpython-37.pyc
delete mode 100644 utils/__pycache__/__init__.cpython-37.pyc
delete mode 100644 utils/__pycache__/logging.cpython-37.pyc
delete mode 100644 utils/__pycache__/path.cpython-37.pyc
create mode 100644 utils/parallel.py
diff --git a/config/__init__.py b/config/__init__.py
index 807f4e4..09e9b3e 100644
--- a/config/__init__.py
+++ b/config/__init__.py
@@ -5,3 +5,4 @@
with open(join(root, 'settings.json')) as f:
settings = json.load(f)
+
diff --git a/config/__pycache__/__init__.cpython-37.pyc b/config/__pycache__/__init__.cpython-37.pyc
deleted file mode 100644
index d51cf416840df6727e8dabfa52d30c59e090c5a8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 365
zcmXwzF-`+95JhdT$!-FXQgDehIRGGpfPw;0fS`b7$#T|{uwmC;*_$G1a{#UYB|Qh=
z9B!$&0u{4GX5{y0#-1;q_xnA9@jW@#U#LHAvx_bkXL!N{VThq65v{V4R!p!e5lO`b
zrvw;g0()E9rszNl96Bc1vMqfZhz!Vk7qSm3dQ)_{BlqM9^??leFFKRkVG>hQc&jUj
zsh)*eH!qPdU12x<08LZaMHnrEv%`HHql(mpQTxg>-1kWfj)<93^P&W$BKOWUk-Hjf
zr1Qv2r%lXuZlA;-v`a1it}l+{V*nrIv$pcm<*N#86Xea>=tj$_^RM`&9OT%$CFFQB
q);0L6)lBBj&WnXqs<1_))Tmy^?4)vLRl+Gn?~oxfddLQJK>q=?)MbMJ
diff --git a/dblpSearch.py b/dblpSearch.py
index 4b23a69..6d04385 100644
--- a/dblpSearch.py
+++ b/dblpSearch.py
@@ -1,35 +1,76 @@
import scraps.dblpSearcher as db
-import html_builder as hbuild
+import htmlBuilder as hb
import os
import webbrowser
from config import settings
+import copy
+from utils.path import slugify
# load parameters
PUBLISHERS = settings['publishers']
NUMBER_PER_PUBLISHER = int(settings['number_per_publisher'])
NUMBER_PER_SEARCH = int(settings['number_per_search'])
-# init output file
-SAVE_PATH = 'output/dblp_output.html'
-SAVE_FULL_PATH = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), SAVE_PATH)
-SAVE_FULL_PATH = SAVE_FULL_PATH.replace('\\', '/')
-print(SAVE_FULL_PATH)
-target = "file:///{}".format(SAVE_FULL_PATH)
+
+
+def get_saving_path(name):
+ abs_path = 'output/{}.html'.format(name)
+ abs_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), abs_path)
+ abs_path = abs_path.replace('\\', '/')
+ return abs_path
+
+# print(SAVE_FULL_PATH)
+# target = "file:///{}".format(SAVE_FULL_PATH)
+
+
+def search(terms):
+ # terms = input("Search for: ")
+ info_ = None
+ while True:
+ m = input("Search by publishers? [y/n]:")
+ if m == 'y':
+ info_ = db.search_by_conference(terms, PUBLISHERS, NUMBER_PER_PUBLISHER)
+ break
+ elif m == 'n':
+ info_ = db.search(terms, NUMBER_PER_SEARCH)
+ break
+ else:
+ continue
+ return info_
+
+
+def merge_info(info1, info2):
+ info_ = copy.deepcopy(info1)
+ # get existed titles in info1
+ existed_titles = set()
+ for item in info1:
+ existed_titles.add(item['title'])
+ # append unseen info2 items to info1
+ for item in info2:
+ title = item['title']
+ if title not in existed_titles:
+ info_.append(item)
+ existed_titles.add(title)
+
+ return info_
+
# search
while True:
- terms = input("Search for: ")
+ terms = [input("Search for: ")]
+ info = search(terms[-1])
while True:
- mode = input("Search by publishers? [y/n]:")
+ mode = input("Continue searching for other terms? [y/n]:")
if mode == 'y':
- info = db.search_by_conference(terms, PUBLISHERS, NUMBER_PER_PUBLISHER)
- break
+ terms.append(input("Search for: "))
+ info = merge_info(info, search(terms[-1]))
elif mode == 'n':
- info = db.search(terms, NUMBER_PER_SEARCH)
break
else:
continue
+
info = db.sort_by_year(info)
- hbuild.save_as_html(info, SAVE_PATH, heading=terms)
- webbrowser.open('file://' + SAVE_FULL_PATH)
+ save_path = get_saving_path(slugify(terms))
+ hb.save_as_html(info, save_path, heading=str(terms))
+ webbrowser.open('file://' + save_path)
diff --git a/html_builder/__init__.py b/htmlBuilder/__init__.py
similarity index 100%
rename from html_builder/__init__.py
rename to htmlBuilder/__init__.py
diff --git a/htmlBuilder/resources/entry_temp.html b/htmlBuilder/resources/entry_temp.html
new file mode 100644
index 0000000..0da0901
--- /dev/null
+++ b/htmlBuilder/resources/entry_temp.html
@@ -0,0 +1,20 @@
+
+
+
+
+
+ link
+
w2XU~BeM&9 zY;3QBOqpG5^f8WF++MPEgE#OmW|WQVS*bg!hPsl7>viq2vC+M0Au1*BQ{;?mwg{Wj J{knd8`!DrhwFv+K diff --git a/requirements.txt b/requirements.txt index a7b439c..fabd88d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ scidownl~=0.2.7 termcolor~=1.1.0 colorama~=0.4.4 requests~=2.24.0 -urllib3~=1.25.11 \ No newline at end of file +urllib3~=1.25.11 +beautifulsoup4 diff --git a/scraps/__pycache__/__init__.cpython-37.pyc b/scraps/__pycache__/__init__.cpython-37.pyc deleted file mode 100644 index 35b181f7df0da17a254478576e86054c6451064a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 163 zcmZ?b<>g`kg4+iV#DnO^AOZ#$feZ&AE@lA|DGb33nv8xc8Hzx{2;!HIvsFxJacWU< zOk`qSj7xrUX>Mv>NpXyCWlCa6VoZ2`QFd`bVsdIsKv8~HYBEqJAh95|$fGnVrZ~AM jv7k66K0Y%qvm`!Vub}c4hfQvNN@-529mtl?K+FIDkvc2# diff --git a/scraps/__pycache__/dblpSearcher.cpython-37.pyc b/scraps/__pycache__/dblpSearcher.cpython-37.pyc deleted file mode 100644 index 15d7158c493cb4c3f62761dea903b1259daffcf0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3930 zcmb_f&2JmW72nwpa=DUdS=N_rCE4b~4FiWlTNf=<+ieotNQy*tV+C>+Du^X#D6O>Q zQnN#cB99*J&W1eIRD8As`!khy&%Le%Q0CvxjR9OYJD#(~9? zC@_U+F** PWzMmA$N1I8sB}h#spZ*P zUMVvdwfq-l9)7xw(=u}za6PEVf1Z(|=cXh%FO!s6Fp@L;tLNj{=!G)u_)5r5y5Xlv zlF^d3pB8%K#mw~5ZoktMa>Fm)v{s6=AFFhFQ8R?58)-BY=PdMLY#FuyQ@<^fj!$;- zMcCT&Rn!rM7%5a=b~pUc2V51MGU&wP?<-CkzO|=RFWp#Qr~UOr?yeUfy-pZk4|~y1 z4{r^vYkRjOVHmCUt(u)%xTCguZbe;{J2LF 8`Ov^>E3 4P_+01u5DyjfWS6ybLze;`~<9RbkX6jE4$XZ}!#r5J@hd8^cO{eaxIgtG@o; z(7f&kcDEnLL$khKb8_#4t*y^L`S6RayAN`^9d*I9REKda=SV{JgHyS8rA%ba&29Sd zz WK*sv`#F77N7^N1E92t z6?$MfrcLb(A5F6WykNsud*Y=&ym8jeUu`cU1L)gFG?-vy==4?l{U0Mfx}w`kI4q2I z3FL+dGA!5E*U1X7pD9$}5$hCG7gQd8_UV1SO1=g?NK#YHk*jz?)EJ;S8w9qJLdZ90 z4TYyEhj+;?B%TzR(xl;#IhIjoVHEi;Mke8fR6hwXSJ0t=Ld4G56Kh0a#DY5E-S8Z9 z_%E$a9!UhaF(>=YqqC%Z%HNu`&rbgzI6F>44#LLaV}5(L_G->P&qopFS8(l`sku)p z2MS-pkX)vUCbj<0Rd%W1_$@3-3BiV2B^8r5Y_&`pz+<6Ia+W$e1mnlX2tiq{WyZb% zZ3D_rOlEYe zv{`=~8(v8hsT$3r2Bly1n{m<# 0;0P4LcPb(_|ep%IH%;@3YPxr4Ve2kX~Hm8^WiGI!rpR#q9S8{EN+!`s`&U=6HM zJ|@ZYBQLWv@5FgZA<917qY%ZNUmBkq4-HkKoN6?a&5%*T6&OSNLxBOpA_%=LBwYU6 zv7!?W8ebtngc3J=bo_CLs6*-0+3z8DMj}!q&(oy!3RjvsaXK1J4NnUz9It`A6uJ57 zr=FJ*Aubl~^n6(%=PW~>=TzS?A6S{0y0@hKW9)02wX!4-=cTaM1AKx~+HZCuRd^tQ zATJe3T;Yzh+8u+L=iibBOXprieh-_qw+1DxxJF>puAp3~ISt%X9p?;lDohW~_3OpL zrsNpUPd$L}l&ILK*gC>31IM$V)iq%GiV45}A?9+o(dZzmHyS~?lQbjz33|4Oaa?|^ zgW^T*MPwk?Y4UQggz!`sMC Mxb?5pjiqks& z(d@z c@Dz9Q*C!~cn zC;(|IXin1(JGo^y?1)BViX3myV#+(Fh(Jdtor$_PI|=W{;x>_3N`e=_l`fJ{*L2NQ V%XPd}_cgcb&3T@8#f4s7_CI|?>UjVF diff --git a/scraps/dblpSearcher.py b/scraps/dblpSearcher.py index ed9add8..5543a66 100644 --- a/scraps/dblpSearcher.py +++ b/scraps/dblpSearcher.py @@ -37,6 +37,18 @@ def get_xml(terms, number, batch_size=100): xml += [fetch(url)] return xml +# def get_xml(terms, number): +# """ +# :param terms: string of searched terms +# :param number: number of results +# :param batch_size: number of results extracted from dblp each time +# :return: a list of xml strings +# """ +# xml = [] +# url = 'https://dblp.org/search/publ/api?q=' + str(terms) + '&h=' + str(number) +# xml += [fetch(url)] +# return xml + def get_attribute(info): """ diff --git a/settings.json b/settings.json index 85ac480..8f68849 100644 --- a/settings.json +++ b/settings.json @@ -1,6 +1,6 @@ { "publishers": [ - "sigcomm", "mobicom", "nsdi", "sensys", "mobisys", "imwut" , "ipsn", "infocom", "mobihoc" + "sigcomm", "mobicom", "nsdi", "sensys", "mobisys", "imwut" , "ubicomp", "ipsn", "infocom", "mobihoc", "sysml", "hotedgevideo", "MM", "VR" ], "number_per_search": 100, "number_per_publisher": 100, diff --git a/utils/__init__.py b/utils/__init__.py index 09dfeb5..f288e13 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -1,2 +1,2 @@ import colorama -colorama.init() \ No newline at end of file +colorama.init() diff --git a/utils/__pycache__/__init__.cpython-37.pyc b/utils/__pycache__/__init__.cpython-37.pyc deleted file mode 100644 index 6c3d58e12eb0efae373f10a495a225409470584c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 198 zcmZ?b<>g`kf-Jk^@d`ltF^B^LOhASM5EqL8i4=wu#vFzyhE#?Hj0+hUfxKV_O{SMX z83sR1rYMf&{G9xv#N5PNESY(kB`X< 7OMY@`Zfagh zag1+eN@7W3On81#c5y*sa%xOKQGQlxGEgQUu^_d`qcka|v?Md9I3_+GWM+K4UP0w8 W4x8Nkl+v73J4T@S#ULkgFaiMmwlswR diff --git a/utils/__pycache__/logging.cpython-37.pyc b/utils/__pycache__/logging.cpython-37.pyc deleted file mode 100644 index 37d9f6fb1d20161fd857e26cc3e871cd97259031..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 928 zcmYjQ&2G~`5Z<*N$8nPWpdJzOflAE*xCexwX@Me0tF%?Ald9TyC&BI78+ILN0-VZ~ zcL+ybX|J4k1y0O5L1L_(`DXUh?Ci{Xx7iE{%A@_qtmP8&%R2ugK=2HgUZCQHQ$gaT zr7=a{5l-gDF10aNcv&T`Py#OZc;(iKeMCO5BB~+^cnwj2DCBiSwb@pgF^oe97xf>6 zk~5NTc?25Qp}L{yQ*usn=N$oZ)dYIA#v6#%dGkA-2$fqBZ}2s~{*A<&oF0?b#&4{W zbX(L^6Dgzuj?jA}lk0i4f6#q#coCV(SOMhopugYi9riHbB5fY_&z|?X2i=z!QS8q^ z2>A&q2i>F76Qp=UF@<@jzrDkBQluHQ^hNY}duNmi$eDH9+dCr7A(t)wH9`y@K`eKo z2k2*@H5((Z eP>T1MlSdPk=0%#1%UFBD6w9JCAIWyKsvs(4G=wD&_>!ien5w)Tb!84^q}%dL zSf;uZ*gI2Y$%kpGz*NehsVWe_G}z|a>_3TG$3p+S?z9J|8k8QKvwW~8lW7Kdp$Bhg zoE2 eX~Om=>wf10ly_tp4F-W?aPgYzur;#1Cjd)h43W z9aCM=jH_olt`(rNa@xwa&D55yz%&XUGcm#O$=;=%eO$AsoO)cha5eE|yc~60Up*T< zLC4iJ*98;8G ?YQr-kwC5z78ZWtn9|{!<%eDNi!V krvjdO=;`~Y?6n%S;e@pO9ES!@gRW8E@u)`~8oIvw7j8TFlK=n! diff --git a/utils/__pycache__/path.cpython-37.pyc b/utils/__pycache__/path.cpython-37.pyc deleted file mode 100644 index 4c2d1914f8e71eea712744f366776f5c2da64ce2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 501 zcmY*V%}T>S5Z=ju|85@o3c2*;Nkqg7da)El#IR7(Y{0ZhH|!*bLT~yYdh|I2A0k&z zzJe!bHV8T}f8XqUGdmZP$xx6?2Oq0<@}HdiibKIUDISwiLP04iNF3;6H*pDbO0V)0 zpD &mswwuLAO+f?E-XU(_iUF?8NCZ5}h=l!U!YIw!>`nMS<8LA(MUHO`C65l ?e+GOcrS-L7TZ6eUU~o7H<05F0CcWCRSsBL9}>)7a(tdW|@UC)n_Z zX J q diff --git a/utils/parallel.py b/utils/parallel.py new file mode 100644 index 0000000..35770af --- /dev/null +++ b/utils/parallel.py @@ -0,0 +1,19 @@ +import threading + + +# https://stackoverflow.com/questions/38978652/how-to-protect-an-object-using-a-lock-in-python +class HidingLock(object): + def __init__(self, obj, lock=None): + self.lock = lock or threading.RLock() + self._obj = obj + + def __enter__(self): + self.lock.acquire() + return self._obj + + def __exit__(self, exc_type, exc_value, traceback): + self.lock.release() + + def set(self, obj): + with self: + self._obj = obj \ No newline at end of file diff --git a/utils/path.py b/utils/path.py index da6e3a6..f23d11e 100644 --- a/utils/path.py +++ b/utils/path.py @@ -1,6 +1,8 @@ import os.path as osp import os from pathlib import Path +import unicodedata +import re def mkdir(path): @@ -11,3 +13,20 @@ def mkdir(path): def path_parent(path): return Path(path).parent + +def slugify(value, allow_unicode=False): + """ + Taken from https://github.com/django/django/blob/master/django/utils/text.py + Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated + dashes to single dashes. Remove characters that aren't alphanumerics, + underscores, or hyphens. Convert to lowercase. Also strip leading and + trailing whitespace, dashes, and underscores. + """ + value = str(value) + if allow_unicode: + value = unicodedata.normalize('NFKC', value) + else: + value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii') + value = re.sub(r'[^\w\s-]', '', value.lower()) + return re.sub(r'[-\s]+', '-', value).strip('-_') +