diff --git a/dict_util.go b/dict_util.go index 344c776..ffb62e2 100644 --- a/dict_util.go +++ b/dict_util.go @@ -29,10 +29,6 @@ import ( ) var ( - // AlphaNum set splitTextToWords can add token - // when words in alphanum - // set up alphanum dictionary word segmentation - AlphaNum = false // ToLower set alpha tolower ToLower = true @@ -61,7 +57,7 @@ func (seg *Segmenter) AddToken(text string, frequency float64, pos ...string) er po = pos[0] } - words := SplitTextToWords([]byte(text)) + words := seg.SplitTextToWords([]byte(text)) token := Token{text: words, frequency: frequency, pos: po} return seg.Dict.addToken(token) @@ -76,7 +72,7 @@ func (seg *Segmenter) AddTokenForce(text string, frequency float64, pos ...strin // RemoveToken remove token in dictionary func (seg *Segmenter) RemoveToken(text string) error { - words := SplitTextToWords([]byte(text)) + words := seg.SplitTextToWords([]byte(text)) token := Token{text: words} return seg.Dict.RemoveToken(token) @@ -261,7 +257,7 @@ func (seg *Segmenter) Read(file string) error { } // 将分词添加到字典中 - words := SplitTextToWords([]byte(text)) + words := seg.SplitTextToWords([]byte(text)) token := Token{text: words, frequency: frequency, pos: pos} seg.Dict.addToken(token) } diff --git a/gse.go b/gse.go index b2965b8..72734d3 100644 --- a/gse.go +++ b/gse.go @@ -47,7 +47,7 @@ type Prob struct { func New(files ...string) Segmenter { var seg Segmenter if len(files) > 1 && files[1] == "alpha" { - AlphaNum = true + seg.AlphaNum = true } seg.LoadDict(files...) diff --git a/gse_test.go b/gse_test.go index 5697504..b6f5195 100644 --- a/gse_test.go +++ b/gse_test.go @@ -187,3 +187,17 @@ func TestStop(t *testing.T) { s = RangeText("hibot, 机器人") tt.Equal(t, "h i b o t , 机 器 人 ", s) } + +func TestNum(t *testing.T) { + seg := New("./testdata/test_dict3.txt") + seg.Num = true + text := "t123test123 num123-1" + s := seg.Cut(text) + tt.Equal(t, "[t 1 2 3 test 1 2 3 num 1 2 3 - 1]", s) + + s = seg.CutAll(text) + tt.Equal(t, "[t 1 2 3 t e s t 1 2 3 n u m 1 2 3 - 1]", s) + + s = seg.CutSearch(text) + tt.Equal(t, "[t 1 2 3 test 1 2 3 num 1 2 3 - 1]", s) +} diff --git a/segmenter.go b/segmenter.go index 1751bba..13df832 100755 --- a/segmenter.go +++ b/segmenter.go @@ -25,6 +25,14 @@ type Segmenter struct { Dict *Dictionary Load bool + // AlphaNum set splitTextToWords can add token + // when words in alphanum + // set up alphanum dictionary word segmentation + AlphaNum bool + Num bool + // ToLower set alpha tolower + // ToLower bool + // LoadNoFreq load not have freq dict word LoadNoFreq bool // MinTokenFreq load min freq token @@ -78,7 +86,7 @@ func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment { } // 划分字元 - text := SplitTextToWords(bytes) + text := seg.SplitTextToWords(bytes) return seg.segmentWords(text, searchMode) } @@ -169,28 +177,28 @@ func updateJumper(jumper *jumper, baseDistance float32, token *Token) { } // SplitTextToWords 将文本划分成字元 -func SplitTextToWords(text Text) []Text { +func (seg *Segmenter) SplitTextToWords(text Text) []Text { output := make([]Text, 0, len(text)/3) - current := 0 + current, alphanumericStart := 0, 0 inAlphanumeric := true - alphanumericStart := 0 for current < len(text) { r, size := utf8.DecodeRune(text[current:]) - if size <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r)) { + isNum := unicode.IsNumber(r) && !seg.Num + if size <= 2 && (unicode.IsLetter(r) || isNum) { // 当前是拉丁字母或数字(非中日韩文字) if !inAlphanumeric { alphanumericStart = current inAlphanumeric = true } - if AlphaNum { + if seg.AlphaNum { output = append(output, toLow(text[current:current+size])) } } else { if inAlphanumeric { inAlphanumeric = false - if current != 0 && !AlphaNum { + if current != 0 && !seg.AlphaNum { output = append(output, toLow(text[alphanumericStart:current])) } } @@ -201,7 +209,7 @@ func SplitTextToWords(text Text) []Text { } // 处理最后一个字元是英文的情况 - if inAlphanumeric && !AlphaNum { + if inAlphanumeric && !seg.AlphaNum { if current != 0 { output = append(output, toLow(text[alphanumericStart:current])) } diff --git a/segmenter_test.go b/segmenter_test.go index 2276d4e..5e8112a 100755 --- a/segmenter_test.go +++ b/segmenter_test.go @@ -25,34 +25,35 @@ func TestGetVer(t *testing.T) { } func TestSplit(t *testing.T) { + var seg1 Segmenter tt.Expect(t, "世/界/有/七/十/亿/人/口/", - bytesToString(SplitTextToWords([]byte("世界有七十亿人口")))) + bytesToString(seg1.SplitTextToWords([]byte("世界有七十亿人口")))) tt.Expect(t, "github/ /is/ /a/ /web/-/based/ /hosting/ /service/,/ /for/ /software/ /development/ /projects/./", - bytesToString(SplitTextToWords([]byte( + bytesToString(seg1.SplitTextToWords([]byte( "GitHub is a web-based hosting service, for software development projects.")))) tt.Expect(t, "雅/虎/yahoo/!/ /致/力/于/,/领/先/的/门/户/网/站/。/", - bytesToString(SplitTextToWords([]byte( + bytesToString(seg1.SplitTextToWords([]byte( "雅虎Yahoo! 致力于,领先的门户网站。")))) tt.Expect(t, "こ/ん/に/ち/は/", - bytesToString(SplitTextToWords([]byte("こんにちは")))) + bytesToString(seg1.SplitTextToWords([]byte("こんにちは")))) tt.Expect(t, "안/녕/하/세/요/", - bytesToString(SplitTextToWords([]byte("안녕하세요")))) + bytesToString(seg1.SplitTextToWords([]byte("안녕하세요")))) tt.Expect(t, "Я/ /тоже/ /рада/ /Вас/ /видеть/", - bytesToString(SplitTextToWords([]byte("Я тоже рада Вас видеть")))) + bytesToString(seg1.SplitTextToWords([]byte("Я тоже рада Вас видеть")))) tt.Expect(t, "¿/cómo/ /van/ /las/ /cosas/", - bytesToString(SplitTextToWords([]byte("¿Cómo van las cosas")))) + bytesToString(seg1.SplitTextToWords([]byte("¿Cómo van las cosas")))) tt.Expect(t, "wie/ /geht/ /es/ /ihnen/", - bytesToString(SplitTextToWords([]byte("Wie geht es Ihnen")))) + bytesToString(seg1.SplitTextToWords([]byte("Wie geht es Ihnen")))) tt.Expect(t, "je/ /suis/ /enchanté/ /de/ /cette/ /pièce/", - bytesToString(SplitTextToWords([]byte("Je suis enchanté de cette pièce")))) + bytesToString(seg1.SplitTextToWords([]byte("Je suis enchanté de cette pièce")))) tt.Expect(t, "[[116 111 32 119 111 114 100 115]]", toWords("to words")) }