reg.py
__author__ = 'thiagocastroferreira'
"""
Author: Thiago Castro Ferreira
Date: 15/07/2018
Description:
Extract referring expressions by overlapping texts and their respective delexicalized templates.
For English and German.
PYTHON VERSION: 2.7
"""
import nltk
import re
from entry import Reference
def process_template(template):
    '''
    Return the tokens that precede and follow the first entity tag (AGENT / PATIENT / BRIDGE) in a template
    :param template: delexicalized template string
    :return: tuple (pre_tag, tag, pos_tag) with the preceding tokens, the tag itself and the following tokens
    '''
    stemplate = template.split()

    tag = ''
    pre_tag, pos_tag, i = [], [], 0
    for token in stemplate:
        i += 1
        if token.split('-')[0] in ['AGENT', 'PATIENT', 'BRIDGE']:
            tag = token
            for pos_token in stemplate[i:]:
                if pos_token.split('-')[0] in ['AGENT', 'PATIENT', 'BRIDGE']:
                    break
                else:
                    pos_tag.append(pos_token)
            break
        else:
            pre_tag.append(token)
    return pre_tag, tag, pos_tag
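# Example (illustrative, with a hypothetical template): the function splits the
# template around the first tag it finds.
#
#   process_template('AGENT-1 was born in PATIENT-1 .')
#   # -> ([], 'AGENT-1', ['was', 'born', 'in'])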
def classify_reference(refex):
    refex = refex.lower().strip()
    if refex in ['he', 'his', 'him', 'she', 'hers', 'her', 'it', 'its', 'we', 'our', 'ours', 'they', 'theirs', 'them']:
        return 'pronoun'

    token = refex.split()[0]
    if token in ['the', 'a', 'an']:
        return 'description'
    elif token in ['this', 'these', 'that', 'those']:
        return 'demonstrative'
    else:
        return 'name'
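# Examples (illustrative): unless the whole expression is a pronoun, the first
# token decides the class.
#
#   classify_reference('it')           # -> 'pronoun'
#   classify_reference('the city')     # -> 'description'
#   classify_reference('this planet')  # -> 'demonstrative'
#   classify_reference('Alan Bean')    # -> 'name'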
def extract_references(text, template, entitymap):
    text = re.sub(r'(.+)-([1-9]+)-(.+)', r'\1-\2 -\3', text, flags=re.U)
    text = ' '.join(nltk.word_tokenize(text))

    template = re.sub(r'(.+)-([1-9]+)-(.+)', r'\1-\2 -\3', template, flags=re.U)
    template = ' '.join(nltk.word_tokenize(template))

    refexes = []
    isOver = False
    number = 0
    while not isOver:
        pre_tag, tag, pos_tag = process_template(template)
        number += 1

        if tag == '':
            isOver = True
        else:
            # Look for reference from 5-gram to 2-gram
            i, f = 5, []
            while i > 1:
                begin = ' '.join(i * ['BEGIN'])
                text = begin + ' ' + text
                template = begin + ' ' + template
                pre_tag, tag, pos_tag = process_template(template)

                regex = re.escape(' '.join(pre_tag[-i:]).strip()) + ' (.+?) ' + re.escape(' '.join(pos_tag[:i]).strip())
                f = re.findall(regex, text)

                template = template.replace('BEGIN', '').strip()
                text = text.replace('BEGIN', '').strip()

                i -= 1

                if len(f) == 1:
                    break

            if len(f) > 0:
                # DO NOT LOWER CASE HERE!!!!!!
                template = template.replace(tag, f[0], 1)
                refex = f[0]

                # Do not include literals
                try:
                    entity = entitymap[tag]
                except KeyError:
                    entity = ''
                    print('entity map exception...', tag)

                reftype = classify_reference(refex)
                refexes.append(Reference(tag=tag, entity=entity, refex=refex, number=number, reftype=reftype))
            else:
                template = template.replace(tag, ' ', 1)

    return refexes
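# Illustrative sketch with hypothetical inputs: given a text, its delexicalized
# template and the tag-to-entity map, the n-gram overlap search recovers one
# referring expression per tag occurrence.
#
#   extract_references('Alan Bean was born in Wheeler .',
#                      'AGENT-1 was born in PATIENT-1 .',
#                      {'AGENT-1': 'Alan_Bean', 'PATIENT-1': 'Wheeler,_Texas'})
#   # -> [Reference(tag='AGENT-1', entity='Alan_Bean', refex='Alan Bean', number=1, reftype='name'),
#   #     Reference(tag='PATIENT-1', entity='Wheeler,_Texas', refex='Wheeler', number=2, reftype='name')]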
def run(entryset, lng='en'):
    for entry in entryset:
        for lex in entry.lexEntries:
            if lng == 'en':
                lex.references = extract_references(lex.text, lex.template, entry.entitymap_to_dict())
            else:
                lex.references_de = extract_references(lex.text_de, lex.template_de, entry.entitymap_to_dict())

    return entryset
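# Illustrative usage (hypothetical names: the loader below is not defined in this
# module, and Reference is assumed to store its constructor arguments as attributes):
#
#   entryset = parse_corpus('webnlg/data')   # hypothetical loader returning entry objects
#   entryset = run(entryset, lng='en')
#   for entry in entryset:
#       for lex in entry.lexEntries:
#           for ref in lex.references:
#               print(ref.tag, ref.refex, ref.reftype)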