From 3ee5423e7eba18cc577e35e0e4a7501b6a89ac91 Mon Sep 17 00:00:00 2001 From: Yan <2641574067@qq.com> Date: Thu, 15 Sep 2022 19:57:13 +0800 Subject: [PATCH] update --- config/__init__.py | 1 + config/__pycache__/__init__.cpython-37.pyc | Bin 365 -> 0 bytes dblpSearch.py | 69 ++++++++++++++---- {html_builder => htmlBuilder}/__init__.py | 0 htmlBuilder/resources/entry_temp.html | 20 +++++ htmlBuilder/resources/log_temp.html | 16 ++++ htmlBuilder/richBuilder.py | 63 ++++++++++++++++ .../__pycache__/__init__.cpython-37.pyc | Bin 1627 -> 0 bytes requirements.txt | 3 +- scraps/__pycache__/__init__.cpython-37.pyc | Bin 163 -> 0 bytes .../__pycache__/dblpSearcher.cpython-37.pyc | Bin 3930 -> 0 bytes scraps/dblpSearcher.py | 12 +++ settings.json | 2 +- utils/__init__.py | 2 +- utils/__pycache__/__init__.cpython-37.pyc | Bin 198 -> 0 bytes utils/__pycache__/logging.cpython-37.pyc | Bin 928 -> 0 bytes utils/__pycache__/path.cpython-37.pyc | Bin 501 -> 0 bytes utils/parallel.py | 19 +++++ utils/path.py | 19 +++++ 19 files changed, 209 insertions(+), 17 deletions(-) delete mode 100644 config/__pycache__/__init__.cpython-37.pyc rename {html_builder => htmlBuilder}/__init__.py (100%) create mode 100644 htmlBuilder/resources/entry_temp.html create mode 100644 htmlBuilder/resources/log_temp.html create mode 100644 htmlBuilder/richBuilder.py delete mode 100644 html_builder/__pycache__/__init__.cpython-37.pyc delete mode 100644 scraps/__pycache__/__init__.cpython-37.pyc delete mode 100644 scraps/__pycache__/dblpSearcher.cpython-37.pyc delete mode 100644 utils/__pycache__/__init__.cpython-37.pyc delete mode 100644 utils/__pycache__/logging.cpython-37.pyc delete mode 100644 utils/__pycache__/path.cpython-37.pyc create mode 100644 utils/parallel.py diff --git a/config/__init__.py b/config/__init__.py index 807f4e4..09e9b3e 100644 --- a/config/__init__.py +++ b/config/__init__.py @@ -5,3 +5,4 @@ with open(join(root, 'settings.json')) as f: settings = json.load(f) + diff --git a/config/__pycache__/__init__.cpython-37.pyc b/config/__pycache__/__init__.cpython-37.pyc deleted file mode 100644 index d51cf416840df6727e8dabfa52d30c59e090c5a8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 365 zcmXwzF-`+95JhdT$!-FXQgDehIRGGpfPw;0fS`b7$#T|{uwmC;*_$G1a{#UYB|Qh= z9B!$&0u{4GX5{y0#-1;q_xnA9@jW@#U#LHAvx_bkXL!N{VThq65v{V4R!p!e5lO`b zrvw;g0()E9rszNl96Bc1vMqfZhz!Vk7qSm3dQ)_{BlqM9^??leFFKRkVG>hQc&jUj zsh)*eH!qPdU12x<08LZaMHnrEv%`HHql(mpQTxg>-1kWfj)<93^P&W$BKOWUk-Hjf zr1Qv2r%lXuZlA;-v`a1it}l+{V*nrIv$pcm<*N#86Xea>=tj$_^RM`&9OT%$CFFQB q);0L6)lBBj&WnXqs<1_))Tmy^?4)vLRl+Gn?~oxfddLQJK>q=?)MbMJ diff --git a/dblpSearch.py b/dblpSearch.py index 4b23a69..6d04385 100644 --- a/dblpSearch.py +++ b/dblpSearch.py @@ -1,35 +1,76 @@ import scraps.dblpSearcher as db -import html_builder as hbuild +import htmlBuilder as hb import os import webbrowser from config import settings +import copy +from utils.path import slugify # load parameters PUBLISHERS = settings['publishers'] NUMBER_PER_PUBLISHER = int(settings['number_per_publisher']) NUMBER_PER_SEARCH = int(settings['number_per_search']) -# init output file -SAVE_PATH = 'output/dblp_output.html' -SAVE_FULL_PATH = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), SAVE_PATH) -SAVE_FULL_PATH = SAVE_FULL_PATH.replace('\\', '/') -print(SAVE_FULL_PATH) -target = "file:///{}".format(SAVE_FULL_PATH) + + +def get_saving_path(name): + abs_path = 'output/{}.html'.format(name) + abs_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), abs_path) + abs_path = abs_path.replace('\\', '/') + return abs_path + +# print(SAVE_FULL_PATH) +# target = "file:///{}".format(SAVE_FULL_PATH) + + +def search(terms): + # terms = input("Search for: ") + info_ = None + while True: + m = input("Search by publishers? [y/n]:") + if m == 'y': + info_ = db.search_by_conference(terms, PUBLISHERS, NUMBER_PER_PUBLISHER) + break + elif m == 'n': + info_ = db.search(terms, NUMBER_PER_SEARCH) + break + else: + continue + return info_ + + +def merge_info(info1, info2): + info_ = copy.deepcopy(info1) + # get existed titles in info1 + existed_titles = set() + for item in info1: + existed_titles.add(item['title']) + # append unseen info2 items to info1 + for item in info2: + title = item['title'] + if title not in existed_titles: + info_.append(item) + existed_titles.add(title) + + return info_ + # search while True: - terms = input("Search for: ") + terms = [input("Search for: ")] + info = search(terms[-1]) while True: - mode = input("Search by publishers? [y/n]:") + mode = input("Continue searching for other terms? [y/n]:") if mode == 'y': - info = db.search_by_conference(terms, PUBLISHERS, NUMBER_PER_PUBLISHER) - break + terms.append(input("Search for: ")) + info = merge_info(info, search(terms[-1])) elif mode == 'n': - info = db.search(terms, NUMBER_PER_SEARCH) break else: continue + info = db.sort_by_year(info) - hbuild.save_as_html(info, SAVE_PATH, heading=terms) - webbrowser.open('file://' + SAVE_FULL_PATH) + save_path = get_saving_path(slugify(terms)) + hb.save_as_html(info, save_path, heading=str(terms)) + webbrowser.open('file://' + save_path) diff --git a/html_builder/__init__.py b/htmlBuilder/__init__.py similarity index 100% rename from html_builder/__init__.py rename to htmlBuilder/__init__.py diff --git a/htmlBuilder/resources/entry_temp.html b/htmlBuilder/resources/entry_temp.html new file mode 100644 index 0000000..0da0901 --- /dev/null +++ b/htmlBuilder/resources/entry_temp.html @@ -0,0 +1,20 @@ + +

+ + + + link +

+
+ + + Error: Embedded data could not be displayed. + + + + + + + +
+ \ No newline at end of file diff --git a/htmlBuilder/resources/log_temp.html b/htmlBuilder/resources/log_temp.html new file mode 100644 index 0000000..1566be7 --- /dev/null +++ b/htmlBuilder/resources/log_temp.html @@ -0,0 +1,16 @@ + + + + + + + + + + +

+

+
+
+ + \ No newline at end of file diff --git a/htmlBuilder/richBuilder.py b/htmlBuilder/richBuilder.py new file mode 100644 index 0000000..9db828d --- /dev/null +++ b/htmlBuilder/richBuilder.py @@ -0,0 +1,63 @@ +from os.path import dirname, realpath, join +import config +from utils.path import mkdir +from copy import copy +from bs4 import BeautifulSoup +from utils.logging import * + + +# logging path +_logging_path = join(config.root, "output") +mkdir(_logging_path) + +# resource path +_dir_path_of_this_file = dirname(realpath(__file__)) +_resource_path = join(_dir_path_of_this_file, 'resources') + +# load templates +with open(join(_resource_path, 'entry_temp.html')) as _f: + _entry_temp = BeautifulSoup(_f, features="html.parser").find('a') + +# make soup with temp +with open(join(_resource_path, 'log_temp.html')) as f: + html_soup = BeautifulSoup(f, features="html.parser") + + +def get_logging_file_path(filename: str) -> str: + path = join(_logging_path, filename) + return path + + +def _write_entry(title: list, paper_link: str, id_: int): + # create entry from temp + entry = copy(_entry_temp) + # sign entry + entry['data-target'] = '#{}'.format(id_) + entry.div['id'] = '{}'.format(id_) + entry.p.a['herf'] = paper_link + # embed title + cnt = 0 + for index, item in enumerate(entry.p.children): + if item.name == "span": + item.string = str(title[cnt]) + cnt += 1 + # embed sub-page + entry.div.object['data'] = paper_link + entry.div.object.embed['src'] = paper_link + # entry.div.iframe['src'] = paper_link + # add to soup + html_soup.html.body.div.append(entry) + + +def save_as_html(info, topic): + saving_path = get_logging_file_path("PaperHub_Searching_Result__{}.html".format(topic)) + # body + for index, item in enumerate(info): + _write_entry([item['year'], item['venue'], item['title']], item['ee'], index) + # header + html_soup.html.head.title.string = "PaperHub: {}".format(topic) + html_soup.html.body.h1.string = topic + html_soup.html.body.h3.string = '{} results'.format(str(len(info))) + with open(saving_path, 'w', encoding='utf-8') as f_: + f_.write(str(html_soup)) + log('{}{} results have been successfully saved to {}'.format(STD_INFO, str(len(info)), saving_path)) diff --git a/html_builder/__pycache__/__init__.cpython-37.pyc b/html_builder/__pycache__/__init__.cpython-37.pyc deleted file mode 100644 index 23131f39934a3c06c93c668f58f10dacd194d5ab..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1627 zcmZ`(&2Jk;6rb4-d;Qg>A<75pVNR8d+DM2Ca;pk80Tl^CX-Wetqt#?*Y;QB(U1r`k zwQ(+_*QgSIL3`xR-@;*zaN-V^o_Mo%AV@IQyqWj)dvD*o=LfxBo8XG#yZp}vA-_51 za`|}q2)B8FL=Zt0iQVZb`eb=}tJAMfUj@@J4k;0l2t@dj#tqRB&6gx@3VKNTt=|!c z9Q3KB?;(kd1jzESNnIefFagrSy@%WUj0DItDyZ--=$XIpUp=K{;R*kwrEh@;{=z$_ z=ado%z=v=_94CS;JSll`3mTWQee&(*SpZFlM7Tp1WD)F;bB~hm9v+b+BK$AOWAZhD z7PJ?LP$PC0)TJ(>3;)%|S-1#acQKxSPTwH*6`vkl6Rj<>pv$Pj$Ioof|7bH@hCi2# zGnE-&#faU@@=C8V z>EfA8!3-bqQtF4*@z9kuIj%A#q#h;-J}@N7ZaK4E*F*_7iL;2@SZpF8QI|GoM7wmu zTc^zL(lxqG^$u!M&*^cwvx|EVxA_&x0ta{phxf$Op8)}i9quEK^MW^=i!CA=^}Xrt zD2q_ry6{B%wTHf)9rA{X?gh?G{^JU(hgiQLmcPQfR$E?2KSW+yi?v#d_~IWea{tm| z#4B3l{?<})<9{=WFp~%Zv3^c}pn~A!y>np0WlJ@M9C7SE8_cjD=iY5*!{;)uf^d`)G!yfcy6cmjj(&&n5Z$4Qcw2XU~BeM&9 zY;3QBOqpG5^f8WF++MPEgE#OmW|WQVS*bg!hPsl7>viq2vC+M0Au1*BQ{;?mwg{Wj J{knd8`!DrhwFv+K diff --git a/requirements.txt b/requirements.txt index a7b439c..fabd88d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ scidownl~=0.2.7 termcolor~=1.1.0 colorama~=0.4.4 requests~=2.24.0 -urllib3~=1.25.11 \ No newline at end of file +urllib3~=1.25.11 +beautifulsoup4 diff --git a/scraps/__pycache__/__init__.cpython-37.pyc b/scraps/__pycache__/__init__.cpython-37.pyc deleted file mode 100644 index 35b181f7df0da17a254478576e86054c6451064a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 163 zcmZ?b<>g`kg4+iV#DnO^AOZ#$feZ&AE@lA|DGb33nv8xc8Hzx{2;!HIvsFxJacWU< zOk`qSj7xrUX>Mv>NpXyCWlCa6VoZ2`QFd`bVsdIsKv8~HYBEqJAh95|$fGnVrZ~AM jv7k66K0Y%qvm`!Vub}c4hfQvNN@-529mtl?K+FIDkvc2# diff --git a/scraps/__pycache__/dblpSearcher.cpython-37.pyc b/scraps/__pycache__/dblpSearcher.cpython-37.pyc deleted file mode 100644 index 15d7158c493cb4c3f62761dea903b1259daffcf0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3930 zcmb_f&2JmW72nwpa=DUdS=N_rCE4b~4FiWlTNf=<+ieotNQy*tV+C>+Du^X#D6O>Q zQnN#cB99*J&W1eIRD8As`!khy&%Le%Q0CvxjR9OYJD#(~9? zC@_U+F**PWzMmA$N1I8sB}h#spZ*P zUMVvdwfq-l9)7xw(=u}za6PEVf1Z(|=cXh%FO!s6Fp@L;tLNj{=!G)u_)5r5y5Xlv zlF^d3pB8%K#mw~5ZoktMa>Fm)v{s6=AFFhFQ8R?58)-BY=PdMLY#FuyQ@<^fj!$;- zMcCT&Rn!rM7%5a=b~pUc2V51MGU&wP?<-CkzO|=RFWp#Qr~UOr?yeUfy-pZk4|~y1 z4{r^vYkRjOVHmCUt(u)%xTCguZbe;{J2LF8`Ov^>E34P_+01u5DyjfWS6ybLze;`~<9RbkX6jE4$XZ}!#r5J@hd8^cO{eaxIgtG@o; z(7f&kcDEnLL$khKb8_#4t*y^L`S6RayAN`^9d*I9REKda=SV{JgHyS8rA%ba&29Sd zzWK*sv`#F77N7^N1E92t z6?$MfrcLb(A5F6WykNsud*Y=&ym8jeUu`cU1L)gFG?-vy==4?l{U0Mfx}w`kI4q2I z3FL+dGA!5E*U1X7pD9$}5$hCG7gQd8_UV1SO1=g?NK#YHk*jz?)EJ;S8w9qJLdZ90 z4TYyEhj+;?B%TzR(xl;#IhIjoVHEi;Mke8fR6hwXSJ0t=Ld4G56Kh0a#DY5E-S8Z9 z_%E$a9!UhaF(>=YqqC%Z%HNu`&rbgzI6F>44#LLaV}5(L_G->P&qopFS8(l`sku)p z2MS-pkX)vUCbj<0Rd%W1_$@3-3BiV2B^8r5Y_&`pz+<6Ia+W$e1mnlX2tiq{WyZb% zZ3D_rOlEYe zv{`=~8(v8hsT$3r2Bly1n{m<#0;0P4LcPb(_|ep%IH%;@3YPxr4Ve2kX~Hm8^WiGI!rpR#q9S8{EN+!`s`&U=6HM zJ|@ZYBQLWv@5FgZA<917qY%ZNUmBkq4-HkKoN6?a&5%*T6&OSNLxBOpA_%=LBwYU6 zv7!?W8ebtngc3J=bo_CLs6*-0+3z8DMj}!q&(oy!3RjvsaXK1J4NnUz9It`A6uJ57 zr=FJ*Aubl~^n6(%=PW~>=TzS?A6S{0y0@hKW9)02wX!4-=cTaM1AKx~+HZCuRd^tQ zATJe3T;Yzh+8u+L=iibBOXprieh-_qw+1DxxJF>puAp3~ISt%X9p?;lDohW~_3OpL zrsNpUPd$L}l&ILK*gC>31IM$V)iq%GiV45}A?9+o(dZzmHyS~?lQbjz33|4Oaa?|^ zgW^T*MPwk?Y4UQggz!`sMCMxb?5pjiqks& z(d@zc@Dz9Q*C!~cn zC;(|IXin1(JGo^y?1)BViX3myV#+(Fh(Jdtor$_PI|=W{;x>_3N`e=_l`fJ{*L2NQ V%XPd}_cgcb&3T@8#f4s7_CI|?>UjVF diff --git a/scraps/dblpSearcher.py b/scraps/dblpSearcher.py index ed9add8..5543a66 100644 --- a/scraps/dblpSearcher.py +++ b/scraps/dblpSearcher.py @@ -37,6 +37,18 @@ def get_xml(terms, number, batch_size=100): xml += [fetch(url)] return xml +# def get_xml(terms, number): +# """ +# :param terms: string of searched terms +# :param number: number of results +# :param batch_size: number of results extracted from dblp each time +# :return: a list of xml strings +# """ +# xml = [] +# url = 'https://dblp.org/search/publ/api?q=' + str(terms) + '&h=' + str(number) +# xml += [fetch(url)] +# return xml + def get_attribute(info): """ diff --git a/settings.json b/settings.json index 85ac480..8f68849 100644 --- a/settings.json +++ b/settings.json @@ -1,6 +1,6 @@ { "publishers": [ - "sigcomm", "mobicom", "nsdi", "sensys", "mobisys", "imwut" , "ipsn", "infocom", "mobihoc" + "sigcomm", "mobicom", "nsdi", "sensys", "mobisys", "imwut" , "ubicomp", "ipsn", "infocom", "mobihoc", "sysml", "hotedgevideo", "MM", "VR" ], "number_per_search": 100, "number_per_publisher": 100, diff --git a/utils/__init__.py b/utils/__init__.py index 09dfeb5..f288e13 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -1,2 +1,2 @@ import colorama -colorama.init() \ No newline at end of file +colorama.init() diff --git a/utils/__pycache__/__init__.cpython-37.pyc b/utils/__pycache__/__init__.cpython-37.pyc deleted file mode 100644 index 6c3d58e12eb0efae373f10a495a225409470584c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 198 zcmZ?b<>g`kf-Jk^@d`ltF^B^LOhASM5EqL8i4=wu#vFzyhE#?Hj0+hUfxKV_O{SMX z83sR1rYMf&{G9xv#N5PNESY(kB`X<7OMY@`Zfagh zag1+eN@7W3On81#c5y*sa%xOKQGQlxGEgQUu^_d`qcka|v?Md9I3_+GWM+K4UP0w8 W4x8Nkl+v73J4T@S#ULkgFaiMmwlswR diff --git a/utils/__pycache__/logging.cpython-37.pyc b/utils/__pycache__/logging.cpython-37.pyc deleted file mode 100644 index 37d9f6fb1d20161fd857e26cc3e871cd97259031..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 928 zcmYjQ&2G~`5Z<*N$8nPWpdJzOflAE*xCexwX@Me0tF%?Ald9TyC&BI78+ILN0-VZ~ zcL+ybX|J4k1y0O5L1L_(`DXUh?Ci{Xx7iE{%A@_qtmP8&%R2ugK=2HgUZCQHQ$gaT zr7=a{5l-gDF10aNcv&T`Py#OZc;(iKeMCO5BB~+^cnwj2DCBiSwb@pgF^oe97xf>6 zk~5NTc?25Qp}L{yQ*usn=N$oZ)dYIA#v6#%dGkA-2$fqBZ}2s~{*A<&oF0?b#&4{W zbX(L^6Dgzuj?jA}lk0i4f6#q#coCV(SOMhopugYi9riHbB5fY_&z|?X2i=z!QS8q^ z2>A&q2i>F76Qp=UF@<@jzrDkBQluHQ^hNY}duNmi$eDH9+dCr7A(t)wH9`y@K`eKo z2k2*@H5((ZeP>T1MlSdPk=0%#1%UFBD6w9JCAIWyKsvs(4G=wD&_>!ien5w)Tb!84^q}%dL zSf;uZ*gI2Y$%kpGz*NehsVWe_G}z|a>_3TG$3p+S?z9J|8k8QKvwW~8lW7Kdp$Bhg zoE2eX~Om=>wf10ly_tp4F-W?aPgYzur;#1Cjd)h43W z9aCM=jH_olt`(rNa@xwa&D55yz%&XUGcm#O$=;=%eO$AsoO)cha5eE|yc~60Up*T< zLC4iJ*98;8G?YQr-kwC5z78ZWtn9|{!<%eDNi!V krvjdO=;`~Y?6n%S;e@pO9ES!@gRW8E@u)`~8oIvw7j8TFlK=n! diff --git a/utils/__pycache__/path.cpython-37.pyc b/utils/__pycache__/path.cpython-37.pyc deleted file mode 100644 index 4c2d1914f8e71eea712744f366776f5c2da64ce2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 501 zcmY*V%}T>S5Z=ju|85@o3c2*;Nkqg7da)El#IR7(Y{0ZhH|!*bLT~yYdh|I2A0k&z zzJe!bHV8T}f8XqUGdmZP$xx6?2Oq0<@}HdiibKIUDISwiLP04iNF3;6H*pDbO0V)0 zpD&mswwuLAO+f?E-XU(_iUF?8NCZ5}h=l!U!YIw!>`nMS<8LA(MUHO`C65l?e+GOcrS-L7TZ6eUU~o7H<05F0CcWCRSsBL9}>)7a(tdW|@UC)n_Z zXJq diff --git a/utils/parallel.py b/utils/parallel.py new file mode 100644 index 0000000..35770af --- /dev/null +++ b/utils/parallel.py @@ -0,0 +1,19 @@ +import threading + + +# https://stackoverflow.com/questions/38978652/how-to-protect-an-object-using-a-lock-in-python +class HidingLock(object): + def __init__(self, obj, lock=None): + self.lock = lock or threading.RLock() + self._obj = obj + + def __enter__(self): + self.lock.acquire() + return self._obj + + def __exit__(self, exc_type, exc_value, traceback): + self.lock.release() + + def set(self, obj): + with self: + self._obj = obj \ No newline at end of file diff --git a/utils/path.py b/utils/path.py index da6e3a6..f23d11e 100644 --- a/utils/path.py +++ b/utils/path.py @@ -1,6 +1,8 @@ import os.path as osp import os from pathlib import Path +import unicodedata +import re def mkdir(path): @@ -11,3 +13,20 @@ def mkdir(path): def path_parent(path): return Path(path).parent + +def slugify(value, allow_unicode=False): + """ + Taken from https://github.com/django/django/blob/master/django/utils/text.py + Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated + dashes to single dashes. Remove characters that aren't alphanumerics, + underscores, or hyphens. Convert to lowercase. Also strip leading and + trailing whitespace, dashes, and underscores. + """ + value = str(value) + if allow_unicode: + value = unicodedata.normalize('NFKC', value) + else: + value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii') + value = re.sub(r'[^\w\s-]', '', value.lower()) + return re.sub(r'[-\s]+', '-', value).strip('-_') +