#Code to extract adverbs (ADV) from CoNLL-U files, and to present their distribution and various statistics. Please refer to
#Edition note: some slight improvements and tweaks have been implemented, so the data produced may not coincide 100% with the figures presented in the paper, but it substantially does
import os, sys
from collections import namedtuple, Counter, defaultdict
#Importing own scripts for CoNLL-U manipulations
sys.path.insert(0, './Tools/')
import CoNLLUTools
#TECHNICAL NOTE: In theory, Udapi could be used for this. However, because its documentation is hard to access and to navigate,
#at the moment I find it easier to perform very simple tree-search operations such as the ones carried out here
#by means of my own code. Any suggestion to make the script more open is welcome.
#Input
try :
    folder = sys.argv[1]
except IndexError :
    print("Please provide the CoNLL-U files you want to analyse, either as a single document, a folder path, or an ordered comma-separated list of files/paths (they can be mixed).")
    quit()
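#A hypothetical invocation (paths purely illustrative) would be:
#    python ADVextractor.py ./UD_Latin-ITTB/,./la_ittb-ud-test.conllu
#Directories are walked recursively; single files are kept only if they bear one of the extensions defined below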
#Hardcoded CoNLL-U extension, but the code could possibly be extended to cover CoNLL-U Plus, too
extensions = ('.conllu',)
#Extraction of documents
texts = []
for doc in folder.strip().split(',') :
    if os.path.isdir(doc) :
        for t , _, fili in os.walk(doc) :
            texts.extend([os.path.join(t,f) for f in fili if os.path.splitext(f)[1] in extensions ])
    else :
        if os.path.splitext(doc)[1] in extensions :
            texts.append(doc)
#
#Creation of output name
from pathlib import Path
output = 'ADV_' + '_'.join(map(lambda x : Path(x).stem,texts))
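#For instance (purely illustrative), an input file 'la_ittb-ud-test.conllu' contributes the stem 'la_ittb-ud-test',
#so the results would go into a folder named 'ADV_la_ittb-ud-test'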
if not os.path.exists(output):
    os.makedirs(output)
#Specific named tuples to handle adverbs
ADV = namedtuple('ADV','form lemma pos morpho deprel head ddeprel')
ADV.__new__.__defaults__ = ('','_','ADV','','',None,()) #one default per field in ADV._fields
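#That is, field by field: form='', lemma='_', pos='ADV', morpho='', deprel='', head=None, ddeprel=()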
#Lists which will be used
adverbs = []
adverbials = []
forms = set()
obliques = []
advcoord = []
#Definitions of the relation classes we need, hardcoded from UD tools
alldeprel = {'acl','advcl','advmod','amod','appos','aux','case','cc','ccomp','clf','compound','conj','cop','csubj','dep','det','discourse','dislocated','expl','fixed','flat','goeswith','iobj','list','mark','nmod','nsubj','nummod','obj','obl','orphan','parataxis','punct','reparandum','root','vocative','xcomp'} #taken from UD folder tools/data/deprel.ud, as of v2.14
clauseheads = {'root','parataxis','csubj','ccomp','xcomp','advcl','acl'} #conj copying the function if it depends on another clause head
verbfunc = {'aux','cop'}
roots = {'root','parataxis','acl'}
horizontal = {'conj','flat','fixed','list'}
nonrelations = {'dep','orphan','punct','reparandum','goeswith'}
#Collection of data
for text in texts :
    print(text)
    for s,a in CoNLLUTools.readCoNLLU(text) :
        print(s['sent_id'],end='\r')
        for n in CoNLLUTools.syntacticwords(a) :
            #We act modulo horizontal (i.e. co-ordinative) structures
            tnode = CoNLLUTools.truehead(a,n.id,conj=horizontal)
            trel = tnode.deprel.split(':')[0] #no subtypes
            thead = tnode.head
            if n.upos == 'ADV' and trel not in nonrelations :
                #Basic features
                adv = ADV(form = n.form.lower(), lemma = n.lemma.lower(), morpho = CoNLLUTools.writeUDfeatures(n.feats), deprel = tnode.deprel.split(':')[0])
                #If the ADV is not the head of a predicate, we fetch some context...
                if tnode.deprel not in clauseheads :
                    #We consider the (true) head of the node
                    hnode = a.nodes[thead]['features']
                    hnucleus = CoNLLUTools.extractnucleus(a,hnode.id)
                    #We define some macrocategories for the head of the ADV: PRED for a (synthetic or periphrastic) predication, NOM for nominals
                    hpos = 'PRED' if (hnode.deprel.split(':')[0] in clauseheads or hnode.upos in ('VERB','AUX') or {'cop','aux'}.intersection(hnucleus.deprels)) else hnode.upos
                    hpos = 'NOM' if hpos in ('NOUN','PROPN','PRON') else hpos
                    #We collect the data about the ADV head and syntactic distances
                    adv = adv._replace(head = hpos)
                    #Co-ordinated ADVs
                    if n.deprel.split(':')[0] == 'conj' and hnode.upos == 'ADV' :
                        advcoord.append((n.lemma.lower(), hnode.lemma.lower()))
                else : #The ADV is itself the head of a clause
                    adv = adv._replace(head = 'ROOT')
                #We collect information about the ADV's possible dependents with meaningful relations
                #In horizontal constructions, we look only at "local dependents", not at possible common dependents of the whole construction
                adv = adv._replace(ddeprel = tuple(sorted(CoNLLUTools.extractnucleus(a,n.id,funcrel = alldeprel - (horizontal | nonrelations)).deprels))) #ddeprel = tuple(sorted([a.nodes[nn]['features'].deprel for nn in a.successors(tnode.id) if not a.nodes[nn]['features'].deprel.startswith(horizontal+('punct',)) and a.nodes[nn]['features'].id != n.id]))
                #We add the ADV profile we have thus found to the list
                adverbs.append(adv)
            #
            #We save all forms of non-ADV elements to compare them with ADVs
            elif n.upos != 'ADV' :
                forms.add((n.form.lower(), n.upos, n.lemma.lower(), CoNLLUTools.writeUDfeatures(n.feats)))
            #We save any other elements tagged with adverbial relations
            if tnode.deprel.split(':')[0] == 'advmod' :
                adverbials.append(ADV(form = n.form.lower(), lemma = n.lemma.lower(), pos=n.upos, morpho = CoNLLUTools.writeUDfeatures(n.feats), deprel = tnode.deprel.split(':')[0]))
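#By the end of this loop, each entry of adverbs is a profile along these lines (values purely illustrative):
#ADV(form='saepe', lemma='saepe', pos='ADV', morpho='Degree=Pos', deprel='advmod', head='PRED', ddeprel=())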
#
#We prepare and lay out the data collected so far
cadv = Counter(ad.form for ad in adverbs) #forms over lemmas, because not every treebank has lemmas
#
ladv = defaultdict(set)
for ad in adverbs :
    ladv[ad.form].add(ad.lemma)
#
radv = defaultdict(Counter)
for ad in adverbs :
    radv[ad.form][ad.deprel] += 1
#
tadv = defaultdict(Counter)
for ad in adverbs :
    tadv[ad.form]['PRED' if ad.head in ('ADJ','DET','NUM','ADV') else ad.head] += 1 #We also conflate all modifiers into the PRED macrocategory
#
dadv = defaultdict(Counter)
for ad in adverbs :
    for d in ad.ddeprel :
        dadv[ad.form][d] += 1
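#To recap: cadv counts occurrences per ADV form, ladv maps each form to its attested lemmas,
#radv to the relations it bears, tadv to the (macro)categories of its heads, and dadv to the relations of its own dependents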
#
#General distribution of ADV types
with open(os.path.join(output,'ADV_distr.tsv'),'w',encoding='utf8') as advex :
    modified = sorted(filter(None,set().union(*[t.keys() for _,t in tadv.items()]))) #All UPOS appearing as heads of an ADV
    advex.write('Form type\tLemmas\tFrequency\t{}\n'.format('\t'.join(modified)))
    for dv in cadv :
        advex.write('{}\t{}\t{}\t{}\n'.format( dv,
                                               ','.join(ladv[dv]),
                                               str(cadv[dv]),
                                               '\t'.join(str(tadv[dv].get(t,0)/cadv[dv]) for t in modified),
                                               ))
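#Each data row of ADV_distr.tsv thus holds the form, its comma-separated lemmas, its absolute frequency,
#and the relative frequency of each head (macro)category listed in the header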
#
#We investigate ADV form types coinciding with forms of other parts of speech
coincidences = {a.form for a in adverbs} & {f[0] for f in forms}
with open(os.path.join(output,'ADV_coinc.tsv'),'w',encoding='utf8') as advex :
    for fc in [f for f in forms if f[0] in coincidences] :
        advex.write('{}\n'.format('\t'.join(fc)))
#
#We investigate nominal-like dependents of ADVs
dnom = ('nmod', 'appos', 'nummod', 'acl', 'amod', 'det', 'clf', 'case', 'cop') #nominal dependents + the copula, which implies the ADV is not metapredicating
nomdependents = defaultdict(dict)
nominallike = set()
threshold = 5 #we want to avoid noise and find some regular patterns
for adv,ddiz in dadv.items() :
    for dr,dc in ddiz.items() :
        if dr.startswith(dnom) and cadv[adv] > threshold :
            nomdependents[dr][adv] = dc/cadv[adv]
            nominallike.add(adv)
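#(dc/cadv[adv] is the relative frequency of that dependent relation over all occurrences of the ADV form)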
with open(os.path.join(output,'ADV_nominals.tsv'),'w',encoding='utf8') as advex :
    for d in nomdependents :
        advex.write('{}\t{}\n\n'.format(d, ' '.join(['/'.join(map(str,i)) for i in sorted(nomdependents[d].items(), key = lambda x : x[1], reverse=True)]) ))
#
#We print ADVs having a form different from the lemma
difforms = defaultdict(lambda : defaultdict(set))
#Finds the transformation, in terms of prefixoid and suffixoid substitutions, needed to go from A to B
def findstringtransformation(AB) :
    import difflib
    from collections import namedtuple
    A,B = AB
    Transformation = namedtuple('Transformation', 'Apref Bpref Asuff Bsuff')
    Transformation.__new__.__defaults__ = ('','','','')
    commons = difflib.SequenceMatcher(None, A, B).find_longest_match(0,len(A),0,len(B))
    return Transformation(len(A[:commons.a]),B[:commons.b],len(A[commons.a+commons.size:]),B[commons.b+commons.size:])
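#A purely illustrative example: findstringtransformation(('fortis','fortiter')) returns
#Transformation(Apref=0, Bpref='', Asuff=1, Bsuff='ter'),
#i.e. to turn 'fortis' into 'fortiter' no prefix changes, one final character is dropped and 'ter' is appended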
#
for af,al in ladv.items() :
    for aall in set(map(str.lower,al)) - {af.lower()} :
        difforms['|'.join(map(str,findstringtransformation((aall,af.lower()))))][af.lower()].add(aall)
with open(os.path.join(output,'ADV_difflemma.tsv'),'w',encoding='utf8') as advex :
    for df,fl in difforms.items() :
        for f,ll in fl.items() :
            advex.write('{}\t{}\t{}\n'.format( df, f, ','.join(ll) ))
#
#We print all morpholexical properties associated with ADVs
advmorph = set().union(*[adv.morpho.split('|') for adv in adverbs])
with open(os.path.join(output,'ADV_morpho.tsv'),'w',encoding='utf8') as advex :
    for m in sorted(advmorph) :
        advex.write('{}\n'.format(m))
#
#Overview of what takes the relation advmod
##Outputs a dict with normalised counts (relative frequencies)
def counternormalisation(c) :
    from collections import Counter
    nc = {}
    total = sum(c.values())
    for x,n in c.items() :
        nc[x] = n / total
    return nc
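#A minimal sanity check (illustrative): counternormalisation(Counter({'ADV': 3, 'PART': 1})) == {'ADV': 0.75, 'PART': 0.25}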
#
advmodcont = counternormalisation(Counter([adv.pos for adv in adverbials]))
posmod = defaultdict(Counter)
for adv in adverbials :
    posmod[adv.pos][adv.lemma] += 1
posmod = {p:counternormalisation(cmod) for p,cmod in posmod.items()}
with open(os.path.join(output,'ADV_advmod.tsv'),'w',encoding='utf8') as advex :
    advex.write('{}\n\n\n'.format('\n'.join(['\t'.join(map(str,c)) for c in sorted(advmodcont.items(),key = lambda x :x[1],reverse=True)])))
    for p,c in posmod.items() :
        advex.write('{}\t{}\n\n'.format(p, ' '.join([','.join(map(str,pc)) for pc in sorted(c.items(),key=lambda x : x[1],reverse=True)])))
#
#Groups of co-ordinated adverbs
import networkx as nx
from networkx.algorithms import connected_components
ADVconj = nx.Graph(advcoord)
connadv = connected_components(ADVconj)
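#Each connected component groups adverbs that are linked, directly or transitively, by co-ordination;
#in the output, the adverbs previously identified as nominal-like are written in upper case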
with open(os.path.join(output,'ADV_coord.tsv'),'w',encoding='utf8') as advex :
    for ac in sorted(list(connadv),key = lambda x : len(x),reverse=True) :
        advex.write('{}\n\n'.format('\t'.join(map(lambda x : x.upper() if x in nominallike else x,ac))))
#