Merge pull request #83 from go-ego/range-pr
add Num option support and use seg.AlphaNum
vcaesar authored Dec 31, 2020
2 parents 2eecf60 + e846783 commit 9538046
Showing 5 changed files with 44 additions and 25 deletions.
10 changes: 3 additions & 7 deletions dict_util.go
@@ -29,10 +29,6 @@ import (
)

var (
// AlphaNum set splitTextToWords can add token
// when words in alphanum
// set up alphanum dictionary word segmentation
AlphaNum = false

// ToLower set alpha tolower
ToLower = true
@@ -61,7 +57,7 @@ func (seg *Segmenter) AddToken(text string, frequency float64, pos ...string) er
po = pos[0]
}

words := SplitTextToWords([]byte(text))
words := seg.SplitTextToWords([]byte(text))
token := Token{text: words, frequency: frequency, pos: po}

return seg.Dict.addToken(token)
@@ -76,7 +72,7 @@ func (seg *Segmenter) AddTokenForce(text string, frequency float64, pos ...strin

// RemoveToken remove token in dictionary
func (seg *Segmenter) RemoveToken(text string) error {
words := SplitTextToWords([]byte(text))
words := seg.SplitTextToWords([]byte(text))
token := Token{text: words}

return seg.Dict.RemoveToken(token)
@@ -261,7 +257,7 @@ func (seg *Segmenter) Read(file string) error {
}

// 将分词添加到字典中
words := SplitTextToWords([]byte(text))
words := seg.SplitTextToWords([]byte(text))
token := Token{text: words, frequency: frequency, pos: pos}
seg.Dict.addToken(token)
}
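Taken together, the dict_util.go changes mean AddToken, AddTokenForce, RemoveToken, and Read now split text through the segmenter instance, and the removed package-level AlphaNum variable is replaced by a field. A minimal sketch of the new call shape, assuming the github.com/go-ego/gse import and the default dictionary; the token and text values are illustrative only:

package main

import (
	"fmt"

	"github.com/go-ego/gse"
)

func main() {
	var seg gse.Segmenter
	// AlphaNum is now configured per instance instead of via gse.AlphaNum.
	seg.AlphaNum = true
	// Load the default dictionary; a custom dictionary path could be passed instead.
	seg.LoadDict()

	// AddToken splits the text with seg.SplitTextToWords before storing the token.
	seg.AddToken("hibot", 100, "n")
	fmt.Println(seg.Cut("hibot机器人"))
}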
2 changes: 1 addition & 1 deletion gse.go
@@ -47,7 +47,7 @@ type Prob struct {
func New(files ...string) Segmenter {
var seg Segmenter
if len(files) > 1 && files[1] == "alpha" {
AlphaNum = true
seg.AlphaNum = true
}
seg.LoadDict(files...)

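In gse.go the "alpha" option now only marks the returned Segmenter, so two instances in the same process no longer share the switch through a global. A hedged sketch, assuming gse is imported and reusing the test dictionary path purely for illustration:

// buildSegs is a hypothetical helper: only the instance built with the "alpha"
// option gets AlphaNum set; the other is unaffected, unlike the old global flag.
func buildSegs() (alphaSeg, plainSeg gse.Segmenter) {
	alphaSeg = gse.New("./testdata/test_dict3.txt", "alpha") // alphaSeg.AlphaNum == true
	plainSeg = gse.New("./testdata/test_dict3.txt")          // plainSeg.AlphaNum == false
	return alphaSeg, plainSeg
}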
14 changes: 14 additions & 0 deletions gse_test.go
@@ -187,3 +187,17 @@ func TestStop(t *testing.T) {
s = RangeText("hibot, 机器人")
tt.Equal(t, "h i b o t , 机 器 人 ", s)
}

func TestNum(t *testing.T) {
seg := New("./testdata/test_dict3.txt")
seg.Num = true
text := "t123test123 num123-1"
s := seg.Cut(text)
tt.Equal(t, "[t 1 2 3 test 1 2 3 num 1 2 3 - 1]", s)

s = seg.CutAll(text)
tt.Equal(t, "[t 1 2 3 t e s t 1 2 3 n u m 1 2 3 - 1]", s)

s = seg.CutSearch(text)
tt.Equal(t, "[t 1 2 3 test 1 2 3 num 1 2 3 - 1]", s)
}
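
The same Num behaviour can be reproduced outside the test; the expected output below is the one asserted by TestNum, and the dictionary is the test fixture used above. A sketch, assuming gse is imported:

// numCut is a hypothetical helper mirroring TestNum: with Num enabled, digits
// leave the surrounding alphanumeric run and come out as single-rune tokens.
func numCut() []string {
	seg := gse.New("./testdata/test_dict3.txt")
	seg.Num = true
	// Per TestNum: [t 1 2 3 test 1 2 3 num 1 2 3 - 1]
	return seg.Cut("t123test123 num123-1")
}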
24 changes: 16 additions & 8 deletions segmenter.go
@@ -25,6 +25,14 @@ type Segmenter struct {
Dict *Dictionary
Load bool

// AlphaNum set splitTextToWords can add token
// when words in alphanum
// set up alphanum dictionary word segmentation
AlphaNum bool
Num bool
// ToLower set alpha tolower
// ToLower bool

// LoadNoFreq load not have freq dict word
LoadNoFreq bool
// MinTokenFreq load min freq token
@@ -78,7 +86,7 @@ func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
}

// 划分字元
text := SplitTextToWords(bytes)
text := seg.SplitTextToWords(bytes)

return seg.segmentWords(text, searchMode)
}
@@ -169,28 +177,28 @@ func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
}

// SplitTextToWords 将文本划分成字元
func SplitTextToWords(text Text) []Text {
func (seg *Segmenter) SplitTextToWords(text Text) []Text {
output := make([]Text, 0, len(text)/3)
current := 0
current, alphanumericStart := 0, 0
inAlphanumeric := true
alphanumericStart := 0

for current < len(text) {
r, size := utf8.DecodeRune(text[current:])
if size <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r)) {
isNum := unicode.IsNumber(r) && !seg.Num
if size <= 2 && (unicode.IsLetter(r) || isNum) {
// 当前是拉丁字母或数字(非中日韩文字)
if !inAlphanumeric {
alphanumericStart = current
inAlphanumeric = true
}

if AlphaNum {
if seg.AlphaNum {
output = append(output, toLow(text[current:current+size]))
}
} else {
if inAlphanumeric {
inAlphanumeric = false
if current != 0 && !AlphaNum {
if current != 0 && !seg.AlphaNum {
output = append(output, toLow(text[alphanumericStart:current]))
}
}
@@ -201,7 +209,7 @@ func SplitTextToWords(text Text) []Text {
}

// 处理最后一个字元是英文的情况
if inAlphanumeric && !AlphaNum {
if inAlphanumeric && !seg.AlphaNum {
if current != 0 {
output = append(output, toLow(text[alphanumericStart:current]))
}
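Since SplitTextToWords is now an exported method, the rune-splitting step can be driven per instance without loading a dictionary, which is how the updated TestSplit below exercises it. A small sketch, assuming fmt and gse are imported; the slash-separated printing only mimics the bytesToString helper used by the tests:

// splitDemo is a hypothetical illustration of the method form; the zero-value
// Segmenter (AlphaNum and Num both false) is enough, since no dictionary is used.
func splitDemo() {
	var seg gse.Segmenter
	for _, w := range seg.SplitTextToWords([]byte("雅虎Yahoo! 致力于,领先的门户网站。")) {
		fmt.Printf("%s/", w)
	}
	fmt.Println()
	// Prints the TestSplit expectation: 雅/虎/yahoo/!/ /致/力/于/,/领/先/的/门/户/网/站/。/
}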
19 changes: 10 additions & 9 deletions segmenter_test.go
@@ -25,34 +25,35 @@ func TestGetVer(t *testing.T) {
}

func TestSplit(t *testing.T) {
var seg1 Segmenter
tt.Expect(t, "世/界/有/七/十/亿/人/口/",
bytesToString(SplitTextToWords([]byte("世界有七十亿人口"))))
bytesToString(seg1.SplitTextToWords([]byte("世界有七十亿人口"))))

tt.Expect(t, "github/ /is/ /a/ /web/-/based/ /hosting/ /service/,/ /for/ /software/ /development/ /projects/./",
bytesToString(SplitTextToWords([]byte(
bytesToString(seg1.SplitTextToWords([]byte(
"GitHub is a web-based hosting service, for software development projects."))))

tt.Expect(t, "雅/虎/yahoo/!/ /致/力/于/,/领/先/的/门/户/网/站/。/",
bytesToString(SplitTextToWords([]byte(
bytesToString(seg1.SplitTextToWords([]byte(
"雅虎Yahoo! 致力于,领先的门户网站。"))))

tt.Expect(t, "こ/ん/に/ち/は/",
bytesToString(SplitTextToWords([]byte("こんにちは"))))
bytesToString(seg1.SplitTextToWords([]byte("こんにちは"))))

tt.Expect(t, "안/녕/하/세/요/",
bytesToString(SplitTextToWords([]byte("안녕하세요"))))
bytesToString(seg1.SplitTextToWords([]byte("안녕하세요"))))

tt.Expect(t, "Я/ /тоже/ /рада/ /Вас/ /видеть/",
bytesToString(SplitTextToWords([]byte("Я тоже рада Вас видеть"))))
bytesToString(seg1.SplitTextToWords([]byte("Я тоже рада Вас видеть"))))

tt.Expect(t, "¿/cómo/ /van/ /las/ /cosas/",
bytesToString(SplitTextToWords([]byte("¿Cómo van las cosas"))))
bytesToString(seg1.SplitTextToWords([]byte("¿Cómo van las cosas"))))

tt.Expect(t, "wie/ /geht/ /es/ /ihnen/",
bytesToString(SplitTextToWords([]byte("Wie geht es Ihnen"))))
bytesToString(seg1.SplitTextToWords([]byte("Wie geht es Ihnen"))))

tt.Expect(t, "je/ /suis/ /enchanté/ /de/ /cette/ /pièce/",
bytesToString(SplitTextToWords([]byte("Je suis enchanté de cette pièce"))))
bytesToString(seg1.SplitTextToWords([]byte("Je suis enchanté de cette pièce"))))

tt.Expect(t, "[[116 111 32 119 111 114 100 115]]", toWords("to words"))
}
