# default_config.py
## folder config. Please take care that each path string ends with a /
folder_dblp_xml = './data/'
folder_content_xml = './data/content_xml/'
folder_pdf = './data/pdf/'
folder_log = './data/logs/'
folder_datasets = './data/datasets/'
folder_classifiers = './data/classifiers/'
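# Illustrative sketch (not part of the original config): a consumer script could
# create these folders up front so later downloads and logging do not fail; the
# helper name below is hypothetical.
def _ensure_folders_exist():
    import os
    for _folder in (folder_dblp_xml, folder_content_xml, folder_pdf,
                    folder_log, folder_datasets, folder_classifiers):
        os.makedirs(_folder, exist_ok=True)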
## mongoDB
mongoDB_IP = '127.0.0.1'
mongoDB_Port = 27017 # default local port; change this if you access MongoDB through an SSH tunnel on your machine (the tunneled port is likely 4321 or 27017).
mongoDB_db = 'pub'
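# Illustrative sketch (an assumption, not the project's actual code): a consumer
# module could open the connection configured above roughly like this, assuming
# pymongo is installed; the helper name is hypothetical.
def _example_mongo_connection():
    from pymongo import MongoClient
    client = MongoClient(mongoDB_IP, mongoDB_Port)
    return client[mongoDB_db]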
## pdf extraction
grobid_url = '127.0.0.1:8080'
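# Illustrative sketch (an assumption, not the project's actual extraction code):
# GROBID is typically called over HTTP; a caller might send a pdf roughly like
# this, assuming the requests library is installed. The endpoint path can differ
# between GROBID versions, and the helper name is hypothetical.
def _example_grobid_fulltext(pdf_path):
    import requests
    with open(pdf_path, 'rb') as pdf_file:
        response = requests.post('http://' + grobid_url + '/api/processFulltextDocument',
                                 files={'input': pdf_file})
    return response.text  # TEI XML on success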
## conferences we like
booktitles = ['JCDL','SIGIR','ECDL','TPDL','TREC', 'ICWSM', 'ESWC', 'ICSR','WWW', 'ICSE', 'HRI', 'VLDB', 'ICRA', 'ICARCV']
# root of the project
ROOTHPATH='/Users/sepidehmesbah/PycharmProjects/SmartPub'
STANFORD_NER_PATH='/Users/sepidehmesbah/PycharmProjects/SmartPub/stanford_files/stanford-ner.jar'
## journals we like
journals = ['IEEE Trans. Robotics' , 'IEEE Trans. Robotics and Automation', 'IEEE J. Robotics and Automation']
## Update process
overwriteDBLP_XML = False
updateNow = True
checkDaily = True
checkWeekly = True
## Only pdf download
only_pdf_download = False
## Only text extraction
only_text_extraction = False
## Only classification and named entity extraction
only_classify_nee = True
####################### XML processing configurations #######################
# set to true if you want to persist to a local mongo DB (default connection)
storeToMongo = True
# set to true if you want to skip downloading EE entries (pdf URLs) which have been accessed before (either successfully or unsuccessfully).
# This only works if storeToMongo is set to True, because MongoDB must be accessed for that. (If you set storeToMongo to false, I will
# just assume that MongoDB is simply not active / not there.)
skipPreviouslyAccessedURLs = True
# the categories you are interested in
CATEGORIES = {'article', 'inproceedings', 'proceedings', 'book', 'incollection', 'phdthesis', 'mastersthesis', 'www'}
# the categories you are NOT interested in
SKIP_CATEGORIES = {'phdthesis', 'mastersthesis', 'www', 'proceedings'}
# the fields which should be in each of your data items / mongo entries
DATA_ITEMS = ["title", "booktitle", "year", "journal", "crossref", "ee"]
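# Illustrative sketch (an assumption about how the sets above might be applied;
# the dict-based record format and helper name are hypothetical, not the
# project's actual XML parser):
def _example_filter_record(tag, record):
    # keep only wanted publication types ...
    if tag not in CATEGORIES or tag in SKIP_CATEGORIES:
        return None
    # ... and only the configured fields
    return {field: record.get(field) for field in DATA_ITEMS}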
# report progress every X pdf downloads / every X XML parsing loops
statusEveryXdownloads = 100
statusEveryXxmlLoops = 1000
###############################################################################