diff --git a/gse_test.go b/gse_test.go
index b6f5195..532ecae 100644
--- a/gse_test.go
+++ b/gse_test.go
@@ -201,3 +201,11 @@ func TestNum(t *testing.T) {
 	s = seg.CutSearch(text)
 	tt.Equal(t, "[t 1 2 3 test 1 2 3 num 1 2 3 - 1]", s)
 }
+
+// TestUrl checks the space-joined tokens CutUrls produces for a sample URL.
+func TestUrl(t *testing.T) {
+	seg := New("./testdata/test_dict3.txt")
+
+	s1 := seg.CutUrls("https://www.g.com/search?q=test%m11.42&ie=UTF-8")
+	tt.Equal(t, "https www g com search q test m 11 42 ie utf 8", s1)
+}
diff --git a/trim.go b/trim.go
index 873663f..41e730b 100644
--- a/trim.go
+++ b/trim.go
@@ -16,6 +16,7 @@ package gse
 
 import (
 	"regexp"
+	"strings"
 	"unicode"
 	"unicode/utf8"
 )
@@ -169,6 +170,35 @@ func (seg *Segmenter) CutTrimHtmls(str string, hmm ...bool) string {
 	return seg.CutStr(s, " ")
 }
 
+// CutUrl splits the URL str into tokens: digit runs are first separated
+// from the surrounding text with SplitNums, the result is segmented with
+// seg.Cut, and the tokens are passed through seg.TrimSymbol.
+func (seg *Segmenter) CutUrl(str string) []string {
+	return seg.TrimSymbol(seg.Cut(SplitNums(str)))
+}
+
+// CutUrls is like CutUrl but returns the tokens joined into a single
+// string using seg.CutStr with a " " separator.
+func (seg *Segmenter) CutUrls(str string) string {
+	return seg.CutStr(seg.CutUrl(str), " ")
+}
+
+// numRunRe matches a maximal run of digits or a maximal run of non-digits.
+// It is compiled once at package scope instead of on every SplitNum call.
+var numRunRe = regexp.MustCompile(`\d+|\D+`)
+
+// SplitNum splits text into alternating runs of digits and non-digits,
+// preserving their order. It returns nil when text is empty.
+func SplitNum(text string) []string {
+	return numRunRe.FindAllString(text, -1)
+}
+
+// SplitNums is like SplitNum but returns the runs joined by single spaces,
+// e.g. "m11.42" becomes "m 11 . 42".
+func SplitNums(text string) string {
+	return strings.Join(SplitNum(text), " ")
+}
+
 // FilterEmoji filter the emoji
 func FilterEmoji(text string) (new string) {
 	for _, value := range text {