-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathreader.py
60 lines (51 loc) · 1.89 KB
/
reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Patty reader
# ===== imports =====
from pprint import pprint as pp
import utils
# ===== definitions =====
class PattyReader:
    """Read a tab-separated relation/pattern file and group patterns by relation.

    Each line after the first is split on tabs; field 0 is used as the
    relation name and field 1 as the pattern text.  After ``processData()``:

    * ``patterns`` maps relation -> list of cleaned pattern strings
    * ``weights``  maps relation -> ``1 - (patterns of relation / all patterns)``

    ``content``, ``patterns`` and ``weights`` are per-instance attributes, so
    several readers can coexist without sharing (and polluting) state.
    """

    # Ordered (token, replacement) pairs applied by __fix_pattern__.
    # ';' is removed first, then the bracketed tags are spelled out.
    _PATTERN_REPLACEMENTS = (
        (';', ''),
        ('[[det]]', 'determiner'),
        ('[[pro]]', 'pronoun'),
        ('[[adj]]', 'adjective'),
        ('[[num]]', 'number'),
        ('[[con]]', 'conjunction'),
        ('[[prp]]', 'preposition'),
        ('[[mod]]', 'modal'),
    )

    # constructor
    def __init__(self, path):
        """Load every line of the file at ``path``, whitespace-stripped.

        Raises whatever ``open`` raises when the file cannot be read.
        """
        # Fresh containers per instance: mutable class-level defaults would
        # be shared (and mutated) by every PattyReader object.
        self.patterns = {}
        self.weights = {}
        with open(path) as f:
            self.content = [line.strip() for line in f]

    # methods
    def processData(self):
        """(Re)build ``self.patterns`` and ``self.weights`` from ``self.content``.

        The first line is skipped (presumably a header row).  Lines that do
        not contain at least two tab-separated fields (e.g. blank lines) are
        ignored instead of raising IndexError.  Results are rebuilt from
        scratch, so calling this method repeatedly is safe.
        """
        grouped = {}
        processed = 0
        for line in self.content[1:]:
            parts = line.split('\t')
            if len(parts) < 2:
                # no pattern column on this line -> nothing to record
                continue
            relation = parts[0]
            pattern = self.__fix_pattern__(parts[1])
            grouped.setdefault(relation, []).append(pattern)
            processed += 1

        # For a well-formed file this equals len(self.content) - 1.
        # It is only used as a divisor when at least one line was processed.
        total = float(processed)
        self.patterns = grouped
        self.weights = {}
        for relation in grouped:
            # the more patterns a relation owns, the lower its weight
            self.weights[relation] = 1. - len(grouped[relation]) / total

    def printPreview(self):
        """Pretty-print up to two (relation, patterns) entries."""
        # list(...) is required on Python 3, where items() is a view that
        # cannot be sliced; on Python 2 it is a harmless copy.
        pp(list(self.patterns.items())[0:2])

    def __make_relation_as_pattern__(self, relation):
        """Return ``utils.splitCamelCase(relation)`` (currently unused here)."""
        return utils.splitCamelCase(relation)

    # private methods
    def __fix_pattern__(self, pattern):
        """Return ``pattern`` with ';' removed and [[tag]] markers spelled out."""
        for token, replacement in self._PATTERN_REPLACEMENTS:
            pattern = pattern.replace(token, replacement)
        return pattern
# ===== main testing =====
if __name__ == "__main__":
    # Manual smoke test: load the paraphrase file from the working directory,
    # build the relation -> patterns table and print a small preview of it.
    patty = PattyReader('dbpedia-relation-paraphrases_json.txt')
    patty.processData()
    patty.printPreview()