# Global options: target_column selects the text field to process;
# _PARALLEL and _VERBOSE presumably toggle multiprocessing and verbose
# logging across all steps (inferred from the option names).
target_column = text
_PARALLEL = True
_VERBOSE = True
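
# Import step: read CSVs from the listed directories and merge the named
# columns into the target text field (inferred from the key names). The
# trailing comma makes a single value parse as a list (ConfigObj-style).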
[import_data]
input_data_directories = datasets,
output_data_directory = data_import
merge_columns = "title", "abstract"
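
# Phrase step: f_abbreviations names the abbreviations CSV used or
# produced by this step (inferred from the key names).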
[phrase]
f_abbreviations = abbreviations.csv
output_data_directory = data_document_scores/
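
# Parse step: each name in pipeline is a text-cleaning pass applied in
# order (Unicode transliteration, de-dashing, acronym replacement, and
# so on, judging by the names); the [[...]] subsections below configure
# individual passes.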
[parse]
output_data_directory = data_parsed
pipeline = unidecoder, dedash, titlecaps, replace_acronyms, separated_parenthesis, replace_from_dictionary, token_replacement, decaps_text, pos_tokenizer
[[replace_from_dictionary]]
suffix = '_MeSH'
[[replace_acronyms]]
suffix = 'ABBR'
[[separated_parenthesis]]
# Only keep long parenthetical content
min_keep_length = 10
[[pos_tokenizer]]
POS_blacklist = 'pronoun', 'verb', 'adjective', 'punctuation', 'possessive', 'symbol', 'cardinal', 'connector', 'adverb', 'unknown'
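
# Embedding step: train each embedding listed in embedding_commands on
# the parsed text; [[w2v_embedding]] below holds its settings.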
[embed]
input_data_directory = data_parsed
output_data_directory = data_embeddings
embedding_commands = w2v_embedding,
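
# Word2vec hyperparameters, presumably passed through to gensim's
# Word2Vec: skip_gram = 0 selects CBOW; hierarchical_softmax = 1 with
# negative = 0 trains with hierarchical softmax instead of negative
# sampling; size is the embedding dimension; min_count drops tokens
# seen fewer than 10 times.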
[[w2v_embedding]]
f_db = w2v.gensim
skip_gram = 0
hierarchical_softmax = 1
epoch_n = 30
window = 5
negative = 0
sample = 1e-5
size = 300
min_count = 10
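
# Scoring step: build the term counts listed in count_commands, score
# each document with score_unique_IDF, and write results to f_db; set
# compute_reduced_representation = True to also store the reduced
# representation configured in [[reduced_representation]].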
[score]
output_data_directory = data_document_scores
f_db = document_scores.h5
count_commands = term_document_frequency, term_frequency,
score_commands = score_unique_IDF,
compute_reduced_representation = False
[[downsample_weights]]
# Downsample weights; adjust as needed (a value of zero has no effect)
aspect = 1.0
way = 1.0
implication = 1.0
relevance = 1.0
research = 1.0
example = 1.0
importance = 1.0
emphasis = 1.0
estimation = 1.0
accuracy = 1.0
estimate = 1.0
variable = 1.0
[[reduced_representation]]
n_components = 25
[[term_frequency]]
f_db = TF.csv
[[term_document_frequency]]
f_db = TDF.csv
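
# Metaclustering step: cluster the document score vectors chosen by
# score_method and save the centroids to f_centroids; the subcluster_*
# options control each clustering pass (inferred from the key names).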
[metacluster]
score_method = unique_IDF
subcluster_m = 2000
#subcluster_kn = 26
subcluster_kn = 25
subcluster_pcut = 0.80
subcluster_repeats = 1
output_data_directory = data_clustering
f_centroids = meta_cluster_centroids.h5
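
# Analysis step: write summary results to output_data_directory,
# carrying master_columns through as document identifiers (inferred);
# topn_words_returned caps how many words are reported.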
[analyze]
compute_dispersion = False
output_data_directory = results
master_columns = 'PMID',
topn_words_returned = 10
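
# LIME explainer settings: metacluster_cosine_minsim appears to be the
# minimum cosine similarity for documents counted toward a cluster;
# n_estimators is presumably the size of the surrogate ensemble.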
[[LIME_explainer]]
metacluster_cosine_minsim = 0.6
score_method = unique_IDF
n_lime_samples = 25 # Increase for more accurate (but slower) explanations
n_lime_features = 50
n_estimators = 50
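
# Prediction step: train classifiers over the document vectors;
# categorical_columns names the label field(s), extra_columns are
# carried through, and use_SMOTE toggles SMOTE oversampling for class
# imbalance (inferred from the key names).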
[predict]
categorical_columns = "journal",
extra_columns = "PMID",
use_SMOTE = False