-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractor.py
84 lines (69 loc) · 2.64 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# opensourced
import re
import string
# Predefined pattern fragments.
#
# Every fragment is wrapped in its own capturing group, so the group layout of
# the composed patterns below (and hence the tuples returned by findall) is
# determined by how these pieces are nested.  Raw strings are used throughout
# so that regex escapes such as \s and \d are not processed as Python string
# escapes.

# Spelled-out numbers; "a" only counts at the very start of the text and when
# followed by whitespace (e.g. "a week ago").
numbers = (
    r"(^a(?=\s)|one|two|three|four|five|six|seven|"
    r"eight|nine|ten|eleven|twelve|thirteen|fourteen|"
    r"fifteen|sixteen|seventeen|eighteen|nineteen|twenty|"
    r"thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand)"
)
day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
week_day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
month = (
    "(january|february|march|april|may|june|july|august|september|"
    "october|november|december)"
)
dmy = "(year|day|week|month)"
rel_day = "(today|yesterday|tomorrow|tonight|tonite)"
exp1 = "(before|after|earlier|later|ago)"
exp2 = "(this|next|last)"
# Numeric timestamp: digits separated by "/" or "-", a space, then
# hh:mm:ss with a fractional part.
iso = r"\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+"
# Four digits that either follow whitespace or start the text.
year = r"((?<=\s)\d{4}|^\d{4})"
# Day of month: one or two digits, at most 31.
date = "([012]?[0-9]|3[01])"

# "<number> <unit>[s] <before|after|...>", e.g. "3 days ago",
# "twenty-one years later".
regxp1 = r"((\d+|(" + numbers + r"[-\s]?)+) " + dmy + "s? " + exp1 + ")"
# "<this|next|last> <unit | weekday | month>", e.g. "next thursday".
regxp2 = "(" + exp2 + " (" + dmy + "|" + week_day + "|" + month + "))"
# "<day> <month> <year>", e.g. "3 march 2020".
regxp3 = "(" + date + " " + month + " " + year + ")"
# "<month> <day>[ordinal][,] <year>", e.g. "march 3rd, 2020".
# The ordinal suffix has to be an alternation: a character class such as
# [th|st|rd] would match just one of those characters.  The group is
# non-capturing so the group numbering of the pattern stays the same.
regxp4 = "(" + month + " " + date + "(?:st|nd|rd|th)?[,]? " + year + ")"

reg1 = re.compile(regxp1, re.IGNORECASE)
reg2 = re.compile(regxp2, re.IGNORECASE)
reg3 = re.compile(rel_day, re.IGNORECASE)
reg4 = re.compile(iso)
reg5 = re.compile(year)
reg6 = re.compile(regxp3, re.IGNORECASE)
reg7 = re.compile(regxp4, re.IGNORECASE)
def extractDate(text):
    """Collect the temporal expressions found in *text*.

    The module-level patterns are applied in a fixed order: relative
    offsets (reg1), this/next/last phrases (reg2), relative days (reg3),
    numeric timestamps (reg4), day-month-year dates (reg6),
    month-day-year dates (reg7) and finally bare years (reg5).

    Returns a list of the matched strings, grouped by pattern in the order
    above.  No de-duplication is performed, so text matched by more than
    one pattern is reported once per pattern.  The input text itself is
    not modified or tagged.
    """

    def whole_matches(pattern):
        # Patterns with several groups make findall return tuples; the
        # first element is the outermost group, i.e. the complete match.
        return [hit[0] for hit in pattern.findall(text) if len(hit) > 1]

    return (
        whole_matches(reg1)
        + whole_matches(reg2)
        + reg3.findall(text)
        + reg4.findall(text)
        + whole_matches(reg6)
        + whole_matches(reg7)
        + reg5.findall(text)
    )