-
Notifications
You must be signed in to change notification settings - Fork 1
/
match.go
96 lines (71 loc) · 1.76 KB
/
match.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
// Copyright 2017 Josh Komoroske. All rights reserved.
// Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE.txt file.
package licensor
import (
"bytes"
"regexp"
"sync"
"github.com/joshdk/licensor/spdx"
)
var (
// reWords matches one "word": a maximal run of one or more characters that
// are either regexp word characters (\w) or apostrophes. wordSet uses it to
// split text into tokens.
reWords = regexp.MustCompile(`[\w']+`)
)
// Best reports the license whose text most closely resembles unknown.
//
// It is a convenience wrapper that calls Closest with a threshold of 1.0.
func Best(unknown []byte) Match {
	// Named so the intent of the literal is visible at the call site.
	const fullConfidence = 1.0

	return Closest(unknown, fullConfidence)
}
// Closest returns the first license with a match confidence that meets or
// exceeds threshold, or the closest match to the given text if none meet the
// threshold.
//
// Each license from spdx.All is scored in its own goroutine by comparing its
// word set against the word set of unknown using dice. Results are consumed
// in the order the goroutines finish, so when more than one license meets
// threshold, which of them is returned depends on scheduling.
//
// If spdx.All yields no licenses, the zero Match is returned.
func Closest(unknown []byte, threshold float64) Match {
	var (
		licenses     = spdx.All()
		unknownWords = wordSet(unknown)
		// Buffered with one slot per license so that every worker can hand
		// over its result without waiting for a receiver. With an unbuffered
		// channel, the early return below would strand all remaining workers
		// (and the closer goroutine) blocked on send forever.
		matches = make(chan Match, len(licenses))
		wg      sync.WaitGroup
	)

	for _, license := range licenses {
		wg.Add(1)
		go func(license spdx.License) {
			defer wg.Done()

			licenseWords := wordSet([]byte(license.Text))
			confidence := dice(licenseWords, unknownWords)

			matches <- Match{
				Confidence: confidence,
				License:    license,
			}
		}(license)
	}

	// Close the channel once every worker has reported so the range loop
	// below terminates.
	go func() {
		wg.Wait()
		close(matches)
	}()

	// Seed with the first result to arrive; on an empty license list the
	// channel is closed immediately and this yields the zero Match.
	best := <-matches
	for match := range matches {
		switch {
		case best.Confidence >= threshold:
			// Good enough: stop without waiting for the remaining workers.
			return best
		case match.Confidence > best.Confidence:
			best = match
		}
	}

	return best
}
// dice returns the Sorensen-Dice coefficient of the two word sets:
//
//	2 * |reference ∩ target| / (|reference| + |target|)
//
// The result lies in [0, 1]: 0 when the sets share no words, 1 when they are
// identical and non-empty. When both sets are empty (or nil) there is nothing
// to compare and 0 is returned.
func dice(reference map[string]struct{}, target map[string]struct{}) float64 {
	total := len(reference) + len(target)
	if total == 0 {
		// Without this guard the division below is 0/0, which is NaN. NaN
		// compares false against everything, so a caller ranking scores
		// could never replace or accept such a result.
		return 0
	}

	// Set intersection is symmetric, so iterate over the smaller set and
	// probe the larger one to minimise the number of lookups.
	small, large := reference, target
	if len(large) < len(small) {
		small, large = large, small
	}

	var common int
	for w := range small {
		if _, ok := large[w]; ok {
			common++
		}
	}

	return 2 * float64(common) / float64(total)
}
// wordSet builds the set of distinct words found in data.
//
// The input is lower-cased first, then every match of reWords in the
// lower-cased bytes becomes a key of the returned map. The result is never
// nil; it is empty when data contains no matches.
func wordSet(data []byte) map[string]struct{} {
	// Lower-case before matching: the tokens are cut from the folded text.
	lowered := bytes.ToLower(data)
	tokens := reWords.FindAll(lowered, -1)

	set := make(map[string]struct{}, len(tokens))
	for i := range tokens {
		set[string(tokens[i])] = struct{}{}
	}

	return set
}