forked from PaddlePaddle/PaddleHub
-
Notifications
You must be signed in to change notification settings - Fork 0
/
semantic_matching.py
76 lines (62 loc) · 2.23 KB
/
semantic_matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import numpy as np
from paddlehub.common.logger import logger
from lda_webpage.vocab import OOV
# Lower bound applied to distribution entries by the divergence methods of
# SemanticMatching before they take ratios and logarithms.
EPS = 1e-06
class WordAndDis(object):
    """Simple holder pairing a ``word`` with a ``distance``.

    Both attributes are initialised to ``None``.
    """

    def __init__(self):
        # Neither field has a value at construction time.
        self.word = self.distance = None
class SemanticMatching(object):
    """Similarity and distance measures over vectors and distributions.

    None of the methods read instance state, and none of them modify the
    arrays passed in by the caller.
    """

    def __init__(self):
        pass

    def l2_norm(self, vec):
        """Calculate the length (Euclidean norm) of a vector.

        Args:
            vec: array-like of numbers.

        Returns:
            The square root of the sum of the squared entries.
        """
        vec = np.asarray(vec)
        return np.sqrt(np.sum(vec**2))

    def cosine_similarity(self, vec1, vec2):
        """Calculate the cosine of the angle between two vectors.

        Args:
            vec1: array-like of numbers.
            vec2: array-like of numbers, broadcastable against ``vec1``.

        Returns:
            ``sum(vec1 * vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
            vector has zero length (the ratio is undefined in that case and
            would otherwise evaluate to NaN).
        """
        vec1 = np.asarray(vec1)
        vec2 = np.asarray(vec2)
        norm1 = self.l2_norm(vec1)
        norm2 = self.l2_norm(vec2)
        if norm1 == 0 or norm2 == 0:
            # Avoid 0 / 0: a zero-length vector has no direction.
            return 0.0
        return np.sum(vec1 * vec2) / norm1 / norm2

    def likelihood_based_similarity(self, terms, doc_topic_dist, model):
        """Average, over in-vocabulary terms, of the term's topic-weighted likelihood.

        For every term known to ``model`` this accumulates
        ``word_topic_value(term, topic) / topic_sum_value(topic) * prob``
        over all entries of ``doc_topic_dist``; the total is then divided by
        the number of in-vocabulary terms.

        Args:
            terms: list of strings
            doc_topic_dist: list of Topic class (each entry exposes ``tid``
                and ``prob``)
            model: TopicModel class (provides ``term_id``,
                ``word_topic_value`` and ``topic_sum_value``)

        Returns:
            The averaged score, or the untouched initial value ``0`` when no
            term is in the vocabulary.
        """
        num_of_term_in_vocab = 0
        result = 0
        for term in terms:
            term_id = model.term_id(term)
            if term_id == OOV:
                # Out-of-vocabulary terms contribute nothing and are not counted.
                continue
            num_of_term_in_vocab += 1
            for topic in doc_topic_dist:
                topic_id = topic.tid
                prob = topic.prob
                result += model.word_topic_value(term_id, topic_id) * 1.0 / \
                    model.topic_sum_value(topic_id) * prob
        if num_of_term_in_vocab == 0:
            return result
        return result / num_of_term_in_vocab

    def kullback_leibler_divergence(self, dist1, dist2):
        """Calculate the Kullback-Leibler divergence ``KL(dist1 || dist2)``.

        Entries of ``dist2`` smaller than ``EPS`` are raised to ``EPS`` on a
        copy, so the caller's array is left unchanged. Entries where
        ``dist1`` is exactly zero contribute nothing (``0 * log 0`` is taken
        as 0) instead of producing NaN.

        Args:
            dist1: array-like distribution.
            dist2: array-like distribution with the same shape as ``dist1``.

        Returns:
            ``sum(dist1 * log(dist1 / dist2))`` over the non-zero entries of
            ``dist1``.

        Raises:
            AssertionError: if the two shapes differ.
        """
        dist1 = np.asarray(dist1)
        dist2 = np.asarray(dist2)
        assert dist1.shape == dist2.shape
        # np.maximum allocates a new array, leaving the input untouched.
        floored = np.maximum(dist2, EPS)
        support = dist1 != 0
        numer = dist1[support]
        return np.sum(numer * np.log(numer / floored[support]))

    def jensen_shannon_divergence(self, dist1, dist2):
        """Calculate the Jensen-Shannon divergence between two distributions.

        Both inputs are floored at ``EPS`` on copies (the caller's arrays are
        left unchanged), then each is compared against their midpoint with
        ``kullback_leibler_divergence`` and the two results are averaged.

        Args:
            dist1: array-like distribution.
            dist2: array-like distribution with the same shape as ``dist1``.

        Returns:
            ``0.5 * KL(dist1 || mean) + 0.5 * KL(dist2 || mean)``.

        Raises:
            AssertionError: if the two shapes differ.
        """
        dist1 = np.asarray(dist1)
        dist2 = np.asarray(dist2)
        assert dist1.shape == dist2.shape
        dist1 = np.maximum(dist1, EPS)
        dist2 = np.maximum(dist2, EPS)
        mean = (dist1 + dist2) * 0.5
        jsd = self.kullback_leibler_divergence(dist1, mean) * 0.5 + \
            self.kullback_leibler_divergence(dist2, mean) * 0.5
        return jsd

    def hellinger_distance(self, dist1, dist2):
        """Calculate the Hellinger distance between two distributions.

        Args:
            dist1: array-like of non-negative numbers.
            dist2: array-like of non-negative numbers with the same shape.

        Returns:
            ``sqrt(0.5 * sum((sqrt(dist1) - sqrt(dist2)) ** 2))``.

        Raises:
            AssertionError: if the two shapes differ.
        """
        dist1 = np.asarray(dist1)
        dist2 = np.asarray(dist2)
        assert dist1.shape == dist2.shape
        squared_gap = np.sum((np.sqrt(dist1) - np.sqrt(dist2))**2)
        # Halve inside the root rather than multiplying by a truncated
        # 1/sqrt(2) literal, which pushed the result slightly above 1 for
        # distributions with disjoint support.
        return np.sqrt(0.5 * squared_gap)