-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathheuristic_filtering.py
76 lines (53 loc) · 2.45 KB
/
heuristic_filtering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import re
from typing import List, Union, Set
def process_label(label: str, lowercase: bool = True, stop_words: Set[str] = None) -> Union[List[str], None]:
    """Heuristically filter and process label(s).

    The raw label is split on ';' and '/' into candidate labels. A candidate
    is dropped when it is shorter than 3 or longer than 75 characters, does
    not start with an upper-case character, or starts (case-insensitively)
    with one of the boilerplate prefixes below. Surviving candidates lose one
    trailing '.' or ':' and have newlines / runs of spaces and tabs collapsed
    to single spaces.

    Args:
        label: Raw label string, possibly holding several labels separated
            by ';' or '/'.
        lowercase: When true, the returned labels are lower-cased.
        stop_words: Optional set of lower-case stop words. When non-empty, a
            candidate is also dropped if it is itself a stop word, or if it
            has several words and its last word (longer than one character)
            or its first word is a stop word. 'without' and 'due' are always
            rejected as first word; 'other', 'further', 'no', 'not', 'own'
            and 'off' are always accepted as first word.

    Returns:
        None when ``label`` is empty or None; otherwise the list of unique
        processed labels in order of first appearance (possibly empty).
    """
    if not label:
        return None

    # Lower-case prefixes that mark a candidate as boilerplate rather than a label.
    filter_prefixes = ('section', 'etc', 'now', 'whereas', 'exhibit ',
                       'therefore', 'article', 'in witness whereof', 'schedule')
    rejected_first_words = {'without', 'due'}
    accepted_first_words = {'other', 'further', 'no', 'not', 'own', 'off'}

    # A list plus a "seen" set keeps the output order deterministic; going
    # through a plain set would make it vary with string hash randomization.
    seen = set()
    filtered_labels = []

    # Handle multi-labels
    for candidate in re.split(r'[;/]', label):
        candidate = candidate.strip()
        if len(candidate) < 3 or len(candidate) > 75:
            continue
        if not candidate[0].isupper():
            continue
        if candidate.lower().startswith(filter_prefixes):
            continue

        if candidate[-1] in ('.', ':'):  # remove scraping artifacts
            candidate = candidate[:-1]
        candidate = re.sub('[ \t]+', ' ', candidate.replace('\n', ' ').strip())
        if not candidate:
            continue

        if stop_words:
            if candidate.lower() in stop_words:
                continue
            words = candidate.split(' ')
            if len(words) > 1:
                if len(words[-1]) > 1 and words[-1].lower() in stop_words:
                    continue
                first_word = words[0].lower()
                if (first_word in stop_words or first_word in rejected_first_words) and \
                        first_word not in accepted_first_words:
                    continue

        if lowercase:
            candidate = candidate.lower()
        if candidate not in seen:
            seen.add(candidate)
            filtered_labels.append(candidate)

    return filtered_labels
def process_text(text: str) -> Union[str, None]:
    """Heuristically filter and process provision text.

    The text is rejected (None is returned) when, after stripping
    surrounding whitespace, it:
      * is empty, None, or shorter than 25 characters;
      * starts with a lower-case character or a double quote (straight or
        right curly);
      * contains one of the definition-style phrases listed below;
      * does not start (after an optional leading '.' or ':' is removed)
        with an upper-case character, '(' or '[';
      * does not end with '.'.

    Args:
        text: Raw provision text.

    Returns:
        The text with newlines and runs of spaces / tabs collapsed to single
        spaces, or None when the text is filtered out.
    """
    if not text:
        return None
    text = text.strip()

    # Phrases that mark the text as a definition. Each entry must be followed
    # by a comma: adjacent string literals are silently concatenated.
    filter_strings = ("” means", '" means', 'shall mean', "' means", '’ means',
                      'shall have the meaning', 'has the meaning', 'have meaning')

    if len(text) < 25 or \
            text[0].islower() or \
            text[0] in ('"', '”') or \
            any(phrase in text for phrase in filter_strings):
        return None

    # Drop a single leading '.' or ':' before checking the first character.
    if text[0] in ('.', ':'):
        text = text[1:].strip()

    if not text[0].isupper() and text[0] not in ('(', '['):
        return None

    if text[-1] != '.':
        return None

    return re.sub('[ \t]+', ' ', text.replace('\n', ' ').strip())