-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjwn_corpusreader.py
125 lines (116 loc) · 4.67 KB
/
jwn_corpusreader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# -*- coding: utf-8 -*-
from nltk.corpus.reader.wordnet import WordNetCorpusReader
import jp_wordnet as JPWN
class JapaneseWordNetCorpusReader(JPWN.JapaneseWordNetCorpusReader):
    """Japanese WordNet reader extended with similarity helpers.

    Builds on ``jp_wordnet.JapaneseWordNetCorpusReader`` and adds a
    memoised pairwise similarity lookup (``calcSimilarity``) plus a
    best-candidate search (``maxSimilaryWord``).
    """

    def __init__(self):
        """Initialise the base reader and an empty similarity cache."""
        JPWN.JapaneseWordNetCorpusReader.__init__(self)
        # Results of calcSimilarity, keyed by the ordered pair (a, b), so
        # that repeated queries do not recompute the similarity.
        self.cache = {}

    def calcSimilarity(self, a, b):
        """Compute the similarity between two synset identifiers.

        Args:
            a: First identifier; converted with ``str()`` if it is not
                already a string.
            b: Second identifier; converted the same way.

        Returns:
            A 3-tuple ``(similarity, None, None)``. ``similarity`` is the
            value returned by ``path_similarity`` (which may be ``None``),
            or ``0`` when either ``self.synset`` lookup yields a falsy
            value. The last two slots are always ``None``.
        """
        if not isinstance(a, str):
            a = str(a)
        if not isinstance(b, str):
            b = str(b)

        # Normalise the argument order so (a, b) and (b, a) share a single
        # cache entry.
        if a > b:
            a, b = b, a
        key = (a, b)

        # Reuse an earlier result when one exists.
        if key in self.cache:
            return self.cache[key]

        jsyn_a = self.synset(a)
        jsyn_b = self.synset(b)
        if jsyn_a and jsyn_b:
            result = (jsyn_a.path_similarity(jsyn_b), None, None)
        else:
            result = (0, None, None)

        # Store the result; without this the cache lookup above can never hit.
        self.cache[key] = result
        return result

    def synsets(self, word):
        """Return the synsets registered for ``word``.

        Args:
            word: Key to look up in ``self._jword2offset``.

        Returns:
            A list with one synset per entry stored for ``word``, or
            ``None`` when ``word`` is not a key of ``self._jword2offset``.
        """
        # NOTE(review): ``_jword2offset`` is not assigned anywhere in this
        # class; presumably the base reader populates it - confirm.
        if word not in self._jword2offset:
            return None
        return [
            WordNetCorpusReader._synset_from_pos_and_offset(
                self, entry['pos'], entry['offset']
            )
            for entry in self._jword2offset[word]
        ]

    def maxSimilaryWord(self, baseWord, compareWords):
        """Find the candidate most similar to ``baseWord``.

        Args:
            baseWord: Reference item passed as the first argument to
                ``calcSimilarity``.
            compareWords: Iterable of candidates to score against it.

        Returns:
            A tuple ``(best_word, best_score)``. When no candidate scores
            above ``0`` the result is ``(None, 0)``.
        """
        maxWord = None
        maxSim = 0
        for w in compareWords:
            r = self.calcSimilarity(baseWord, w)
            # Debug trace of every scored candidate (kept from the original).
            print(r)
            score = r[0]
            # The similarity slot may be None; treat that as "no match"
            # instead of comparing None with a number.
            if score is not None and score > maxSim:
                maxWord = w
                # Keep only the scalar score so later comparisons stay
                # number-to-number.
                maxSim = score
        return (maxWord, maxSim)