Skip to content

Commit

Permalink
Merge pull request #113 from go-ego/range-pr
Browse files Browse the repository at this point in the history
Optimize load dictionary and stop by embed files, support user dictio…
  • Loading branch information
vcaesar authored Sep 12, 2021
2 parents ff04d83 + 11bdec4 commit 30fc7f1
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 20 deletions.
2 changes: 1 addition & 1 deletion circle.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ version: 2
jobs:
build:
docker:
- image: golang:1.16.7
- image: golang:1.17.1
working_directory: /gopath/src/github.com/go-ego/gse
steps:
- checkout
Expand Down
52 changes: 38 additions & 14 deletions dict_1.16.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//go:build go1.16
// +build go1.16

package gse
Expand All @@ -19,29 +20,32 @@ func NewEmbed(dict ...string) (seg Segmenter, err error) {
seg.AlphaNum = true
}

err = seg.LoadDictEmbed(dict...)
return
}

// LoadDictEmbed loads dictionary entries into seg from the embedded
// dictionary and/or a caller-supplied dictionary string.
//
// Only dict[0] is consulted; any further arguments are not read here:
//
//   - no argument: the embedded dataDict is passed to LoadDictStr;
//   - dict[0] contains ", ": it is split on ", " and each piece is passed to
//     LoadDictStr; if dict[0] also contains "zh," the embedded dataDict is
//     loaded first and the first piece is skipped;
//   - otherwise dict[0] is passed to LoadDictStr unchanged.
//
// NOTE(review): this listing comes from a rendered commit diff and appears to
// interleave removed and added lines without +/- markers (two consecutive
// Contains conditions, two LoadDictEmbed signatures). Confirm the exact
// statement order against the committed file.
func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error) {
if len(dict) > 0 {
d := dict[0]
if strings.Contains(d, "zh,") {
if strings.Contains(d, ", ") {
begin := 0
s := strings.Split(d, ", ")
err = seg.LoadDictEmbed()
if err != nil {
return
// NOTE(review): "zh," is matched anywhere in d, yet begin = 1 always skips
// s[0], whatever s[0] actually holds.
if strings.Contains(d, "zh,") {
begin = 1
err = seg.LoadDictStr(dataDict)
}

err = seg.LoadDictStr(s[1])
// NOTE(review): err is reassigned on every call, so an error from the
// embedded dictionary or from an earlier piece is overwritten by later calls;
// only the result of the last LoadDictStr call is returned.
for i := begin; i < len(s); i++ {
err = seg.LoadDictStr(s[i])
}
return
}

err = seg.LoadDictStr(d)
return
}

err = seg.LoadDictEmbed()
return
}

// LoadDictEmbed load dictionary by embed file
func (seg *Segmenter) LoadDictEmbed() error {
return seg.LoadDictStr(dataDict)
}

Expand All @@ -59,11 +63,11 @@ func (seg *Segmenter) LoadDictStr(dict string) error {
if size == 0 {
continue
}
text := strings.Trim(s1[0], " ")
text := strings.TrimSpace(s1[0])

freqText := ""
if len(s1) > 1 {
freqText = s1[1]
freqText = strings.TrimSpace(s1[1])
}

frequency := seg.Size(size, text, freqText)
Expand All @@ -87,7 +91,27 @@ func (seg *Segmenter) LoadDictStr(dict string) error {
}

// LoadStopEmbed loads stop words into seg from the embedded stop dictionary
// and/or a caller-supplied string.
//
// Only dict[0] is consulted; any further arguments are not read here:
//
//   - no argument: the embedded stopDict is passed to LoadStopStr;
//   - dict[0] contains ", ": it is split on ", " and each piece is passed to
//     LoadStopStr; if dict[0] also contains "zh," the embedded stopDict is
//     loaded first and the first piece is skipped;
//   - otherwise dict[0] is passed to LoadStopStr unchanged.
//
// NOTE(review): the two signature lines below look like the pre-change and
// post-change versions shown side by side by the diff view. Presumably only
// the variadic one belongs to the resulting file; confirm against the
// committed source.
func (seg *Segmenter) LoadStopEmbed() error {
func (seg *Segmenter) LoadStopEmbed(dict ...string) (err error) {
if len(dict) > 0 {
d := dict[0]
if strings.Contains(d, ", ") {
begin := 0
s := strings.Split(d, ", ")
// NOTE(review): "zh," is matched anywhere in d, yet begin = 1 always skips
// s[0], whatever s[0] actually holds.
if strings.Contains(d, "zh,") {
begin = 1
err = seg.LoadStopStr(stopDict)
}

// NOTE(review): err is reassigned on every call, so only the result of the
// last LoadStopStr call is returned; earlier failures are overwritten.
for i := begin; i < len(s); i++ {
err = seg.LoadStopStr(s[i])
}
return
}

err = seg.LoadStopStr(d)
return
}

return seg.LoadStopStr(stopDict)
}

Expand Down
37 changes: 32 additions & 5 deletions dict_1.16_test.go
Original file line number Diff line number Diff line change
@@ -1,39 +1,66 @@
//go:build go1.16
// +build go1.16

package gse

import (
_ "embed"
"testing"

"github.com/vcaesar/tt"
)

// testDict holds the contents of testdata/test_dict3.txt, embedded at build
// time; the tests pass it to LoadDictEmbed and NewEmbed as dictionary text.
//
//go:embed testdata/test_dict3.txt
var testDict string

// testDict2 holds the contents of testdata/test_dict2.txt, embedded at build
// time; the tests append it to the dictionary string given to NewEmbed.
//
//go:embed testdata/test_dict2.txt
var testDict2 string

// testStop holds the contents of testdata/stop.txt, embedded at build time;
// the tests pass it to LoadStopEmbed as stop-word text.
//
//go:embed testdata/stop.txt
var testStop string

// TestLoadDictEmbed checks that LoadDictEmbed accepts a dictionary supplied as
// a string, and that a segmenter built by NewEmbed from a ", "-separated
// dictionary string can Find the expected words with the expected frequency
// and part-of-speech tag.
//
// NOTE(review): this listing comes from a rendered commit diff; the two
// NewEmbed calls below look like the pre-change and post-change versions of
// the same statement. Confirm against the committed test file.
func TestLoadDictEmbed(t *testing.T) {
// var seg1 Segmenter
// err := seg1.LoadDictEmbed()
// tt.Nil(t, err)
var seg2 Segmenter
err := seg2.LoadDictEmbed(testDict)
tt.Nil(t, err)

seg1, err := NewEmbed("zh, world 20 n", "en")
// The dictionary string combines the "zh" marker, one inline entry
// ("word1 20 n") and the two embedded test dictionaries.
seg1, err := NewEmbed("zh, word1 20 n, "+testDict+", "+testDict2, "en")
tt.Nil(t, err)

f, pos, ok := seg1.Find("1号店")
tt.Bool(t, ok)
tt.Equal(t, "n", pos)
tt.Equal(t, 3, f)

f, pos, ok = seg1.Find("hello")
tt.Bool(t, ok)
tt.Equal(t, "", pos)
tt.Equal(t, 20, f)

f, pos, ok = seg1.Find("world")
tt.Bool(t, ok)
tt.Equal(t, "n", pos)
tt.Equal(t, 20, f)

// "word1" is the inline entry passed directly in the NewEmbed string.
f, pos, ok = seg1.Find("word1")
tt.Bool(t, ok)
tt.Equal(t, "n", pos)
tt.Equal(t, 20, f)

f, pos, ok = seg1.Find("新星共和国")
tt.Bool(t, ok)
tt.Equal(t, "ns", pos)
tt.Equal(t, 32, f)

f, _, ok = seg1.Find("八千一百三十七万七千二百三十六口")
tt.Bool(t, ok)
tt.Equal(t, 2, f)
}

// TestLoadStopEmbed checks that LoadStopEmbed succeeds and that the words
// "比如" and "离开" are reported as stop words afterwards.
//
// NOTE(review): the two LoadStopEmbed calls below look like the pre-change and
// post-change versions of the same statement as shown by the diff view;
// confirm against the committed test file.
func TestLoadStopEmbed(t *testing.T) {
var seg1 Segmenter
err := seg1.LoadStopEmbed()
err := seg1.LoadStopEmbed("zh, " + testStop)
tt.Nil(t, err)
tt.Bool(t, seg1.IsStop("比如"))
tt.Bool(t, seg1.IsStop("离开"))
}

0 comments on commit 30fc7f1

Please sign in to comment.