-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathweigher.go
59 lines (48 loc) · 1.47 KB
/
weigher.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
package tfidf
import (
"math"
"strings"
)
// New builds a Weigher that reads its document statistics from ds.
func New(ds DocumentStorage) *Weigher {
	return &Weigher{ds: ds}
}
// DocumentStorage is the source of corpus statistics that a Weigher queries
// while scoring a document.
type DocumentStorage interface {
// DocumentsWith returns the number of stored documents that contain the term t.
DocumentsWith(t string) uint
// Documents returns the total number of documents in the storage.
Documents() uint
}
// Weigher computes TF-IDF scores for documents, using the statistics provided
// by a DocumentStorage.
type Weigher struct {
// ds supplies the document counts that Score reads.
ds DocumentStorage
}
// Score calculates the TF-IDF weight of every distinct term in the document d.
//
// Terms are the whitespace-separated words of d; any run of spaces, tabs or
// newlines counts as a single separator. The result maps each distinct term to
// tf(t) * idf(t), where tf(t) is the term's share of all terms in d and idf(t)
// is log10(total documents / documents containing t) as reported by the
// storage. A term that the storage reports in no document gets a score of 0.
// An empty or whitespace-only d yields an empty map.
func (w *Weigher) Score(d string) map[string]float64 {
	// strings.Fields never yields empty tokens, so empty input and repeated
	// separators cannot create a bogus "" term or inflate the term count.
	terms := strings.Fields(d)

	// tf holds the raw number of occurrences of each distinct term in d.
	tf := make(map[string]int)
	for _, term := range terms {
		tf[term]++
	}

	tfidf := make(map[string]float64, len(tf))
	if len(tf) == 0 {
		return tfidf
	}

	// tt is the total number of terms in d, the denominator of every tf(t).
	tt := float64(len(terms))
	// The corpus size does not depend on the term, so query it only once.
	docs := float64(w.ds.Documents())

	for term, freq := range tf {
		// dwt is the number of documents in the storage that contain the term.
		dwt := float64(w.ds.DocumentsWith(term))
		if dwt == 0 {
			// Unknown term: score it 0 rather than dividing by zero.
			tfidf[term] = 0
			continue
		}
		tfidf[term] = float64(freq) / tt * math.Log10(docs/dwt)
	}
	return tfidf
}