diff --git a/README.md b/README.md index 545fd7a..3d4775b 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ To see all options, use`uno -h` ### -d The distance option `-d` can be used to specify how different a new line must be from the others we've seen to be deemed -new/unique. It can take a value between `0` and `1`. The default is `0.3` (30% difference) +new/unique. It can take a value between `0` and `1`. The default is `0.2` (20% difference) ### -all @@ -74,3 +74,18 @@ To see all input lines and highlight the new ones in red, use `-all` cat my_log_file.txt | uno -all uno -all my_log_file.txt ``` + +### -p + +Output log patterns (numbers are replaced by `*`) + +```bash +cat my_log_file.txt | uno -p +uno -p my_log_file.txt + +Jun * *:*:* combo ftpd[*]: connection from * (*-*-*-*.bflony.adelphia.net) at Fri Jun * *:*:* * +Jun * *:*:* combo cups: cupsd shutdown succeeded +Jul * *:*:* combo gpm[*]: *** info [mice.c(*)]: +Jul * *:*:* combo gpm[*]: imps2: Auto-detected intellimouse PS/* + +``` diff --git a/go.mod b/go.mod index 9d108c0..ee72979 100644 --- a/go.mod +++ b/go.mod @@ -2,11 +2,14 @@ module github.com/psykhi/uno go 1.18 -require github.com/stretchr/testify v1.8.0 +require ( + github.com/blevesearch/segment v0.9.0 + github.com/fatih/color v1.13.0 + github.com/stretchr/testify v1.8.0 +) require ( github.com/davecgh/go-spew v1.1.1 // indirect - github.com/fatih/color v1.13.0 // indirect github.com/mattn/go-colorable v0.1.9 // indirect github.com/mattn/go-isatty v0.0.14 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect diff --git a/go.sum b/go.sum index 9456103..5ff2a20 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/blevesearch/segment v0.9.0 h1:5lG7yBCx98or7gK2cHMKPukPZ/31Kag7nONpoBt22Ac= +github.com/blevesearch/segment v0.9.0/go.mod h1:9PfHYUdQCgHktBgvtUOF4x+pc4/l8rdH0u5spnW85UQ= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 
h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= diff --git a/main.go b/main.go index ba886b1..069e310 100644 --- a/main.go +++ b/main.go @@ -13,7 +13,9 @@ import ( func main() { printAll := flag.Bool("all", false, "Print all lines and highlight new lines in red (if the terminal supports it)") - maxDiffRatio := flag.Float64("d", 0.3, "The maximum difference ratio between the input line and the other lines seen (between 0 and 1)") + maxDiffRatio := flag.Float64("d", 0.2, "The maximum difference ratio between the input line and the other lines seen (between 0 and 1)") + patterns := flag.Bool("p", false, "Show log patterns") + flag.Parse() if *maxDiffRatio < 0 || *maxDiffRatio > 1 { @@ -42,16 +44,20 @@ func main() { l.Input = scanner.Bytes() l.IsNew = false l = p.Process(l) + toPrint := l.Input + if *patterns { + toPrint = []byte(strings.Join(l.Tokens, "")) + } if l.IsNew { if *printAll { - color.Red("%s", l.Input) + color.Red("%s", toPrint) } else { - fmt.Printf("%s\n", l.Input) + fmt.Printf("%s\n", toPrint) } continue } if *printAll { - fmt.Printf("%s\n", l.Input) + fmt.Printf("%s\n", toPrint) } } diff --git a/pkg/levenshtein/levenshtein.go b/pkg/levenshtein/levenshtein.go index 14ebc8d..7a26958 100644 --- a/pkg/levenshtein/levenshtein.go +++ b/pkg/levenshtein/levenshtein.go @@ -2,44 +2,6 @@ package levenshtein import "math" -// Compute Levenshtein distance on byte arrays -// https://en.wikipedia.org/wiki/Levenshtein_distance?section=9#Iterative_with_two_matrix_rows -// v0 and v1 buffers can be provided in order to reuse them and avoid allocations -func LevenshteinDistance(a []byte, b []byte, v0 []int, v1 []int) int { - m := len(a) - n := len(b) - - if v0 == nil { - v0 = make([]int, n+1, n+1) - } - if v1 == nil { - v1 = make([]int, n+1, n+1) - } - - for i := 0; i < n+1; i++ { - v0[i] = i - } - - for i := 0; i < m; i++ { - v1[0] = i + 1 - - for j := 0; j < n; j++ { - 
substitutionCost := 0 - if a[i] == b[j] { - substitutionCost = 0 - } else { - substitutionCost = 1 - } - v1[j+1] = min3(v1[j]+1, v0[j+1]+1, v0[j]+substitutionCost) - } - temp := v0 - v0 = v1 - v1 = temp - } - - return v0[n] -} - func min(a int, b int) int { if a < b { return a @@ -54,80 +16,10 @@ func max(a int, b int) int { return b } -func LevenshteinDistanceK(a []byte, b []byte, v0 []int, v1 []int, k int) int { - m := len(a) - n := len(b) - - if n == 0 { - if m <= k { - return m - } - return -1 - } else if m == 0 { - if n <= k { - return n - } - } - - if n > m { - n, m = m, n - a, b = b, a - } - - if v0 == nil { - v0 = make([]int, n+1, n+1) - } - if v1 == nil { - v1 = make([]int, n+1, n+1) - } - - boundary := min(n, k) + 1 - for i := 0; i < boundary; i++ { - v0[i] = i - } - for i := boundary; i < n+1; i++ { - v0[i] = math.MaxInt32 - } - for i := 0; i < n+1; i++ { - v1[i] = math.MaxInt32 - } - - for i := 1; i <= m; i++ { - v1[0] = i + 1 - - minStripe := max(1, i-k) - - max := min(n, i+k) - - if i > math.MaxInt32-k { - max = n - } - - if minStripe > max { - return -1 - } - if minStripe > 1 { - v1[minStripe-1] = math.MaxInt32 - } - - for j := minStripe; j <= max; j++ { - if b[j-1] == a[i-1] { - v1[j] = v0[j-1] - } else { - v1[j] = 1 + min3(v1[j-1], v0[j], v0[j-1]) - } - - } - v0, v1 = v1, v0 - } - - if v0[n] <= k { - return v0[n] - } - return -1 -} - -func LevenshteinDistanceKStrings(a []string, b []string, v0 []int, v1 []int, k int) int { +// Compute Levenshtein distance on sequences of comparable tokens (e.g. bytes or strings) +// https://en.wikipedia.org/wiki/Levenshtein_distance?section=9#Iterative_with_two_matrix_rows +// v0 and v1 buffers can be provided in order to reuse them and avoid allocations +func LevenshteinDistanceK[token comparable](a []token, b []token, v0 []int, v1 []int, k int) int { m := len(a) n := len(b) @@ -200,44 +92,6 @@ func LevenshteinDistanceKStrings(a []string, b []string, v0 []int, v1 []int, k i return -1 } -// Compute Levenshtein distance on strings sequences -// 
https://en.wikipedia.org/wiki/Levenshtein_distance?section=9#Iterative_with_two_matrix_rows -// v0 and v1 buffers can be provided in order to reuse them and avoid allocations -func LevenshteinDistanceStrings(a []string, b []string, v0 []int, v1 []int) int { - m := len(a) - n := len(b) - - if v0 == nil { - v0 = make([]int, n+1, n+1) - } - if v1 == nil { - v1 = make([]int, n+1, n+1) - } - - for i := 0; i < n+1; i++ { - v0[i] = i - } - - for i := 0; i < m; i++ { - v1[0] = i + 1 - - for j := 0; j < n; j++ { - substitutionCost := 0 - if a[i] == b[j] { - substitutionCost = 0 - } else { - substitutionCost = 1 - } - v1[j+1] = min3(v1[j]+1, v0[j+1]+1, v0[j]+substitutionCost) - } - temp := v0 - v0 = v1 - v1 = temp - } - - return v0[n] -} - func min3(a, b, c int) int { if b < c { if b < a { diff --git a/pkg/levenshtein/levenshtein_test.go b/pkg/levenshtein/levenshtein_test.go index 9d17bc8..26c97de 100644 --- a/pkg/levenshtein/levenshtein_test.go +++ b/pkg/levenshtein/levenshtein_test.go @@ -5,18 +5,6 @@ import ( "testing" ) -type seq struct { - chars []byte -} - -func (s *seq) Val(i int) interface{} { - return s.chars[i] -} - -func (s *seq) Len() int { - return len(s.chars) -} - func TestDistanceK(t *testing.T) { d := LevenshteinDistanceK([]byte("kitten"), []byte("sitting"), nil, nil, 3) assert.Equal(t, 3, d) @@ -29,41 +17,12 @@ func TestDistanceK(t *testing.T) { d = LevenshteinDistanceK([]byte("elephant"), []byte(""), nil, nil, 100) assert.Equal(t, 8, d) -} -func TestDistance(t *testing.T) { - d := LevenshteinDistance([]byte("kitten"), []byte("sitting"), nil, nil) - assert.Equal(t, 3, d) + d = LevenshteinDistanceK([]string{"hello", "world"}, []string{"hello", "earth"}, nil, nil, 100) + assert.Equal(t, 1, d) } func BenchmarkLevenshteinDistance(b *testing.B) { - b.Run("6 char string", func(b *testing.B) { - x := []byte("ABCDEF") - y := []byte("ABCCDEF") - - b.ReportAllocs() - for i := 0; i < b.N; i++ { - LevenshteinDistance(x, y, nil, nil) - } - }) - b.Run("25-30 char 
string", func(b *testing.B) { - x := []byte("This is a longer string") - y := []byte("This is a much longer string") - - b.ReportAllocs() - for i := 0; i < b.N; i++ { - LevenshteinDistance(x, y, nil, nil) - } - }) - b.Run("Long log line", func(b *testing.B) { - x := []byte("10__8__0__146 kernel process Google Chrome Ca[3955] caught causing excessive wakeups. Observed wakeups rate (per sec): 392; Maximum permitted wakeups rate (per sec): 150; Observation period: 300 seconds; Task lifetime number of wakeups: 317314") - y := []byte("10__8__0__146 kernel process Sublime Text[802] caught causing excessive wakeups. Observed wakeups rate (per sec): 233; Maximum permitted wakeups rate (per sec): 150; Observation period: 300 seconds; Task lifetime number of wakeups: 95333") - - b.ReportAllocs() - for i := 0; i < b.N; i++ { - LevenshteinDistance(x, y, nil, nil) - } - }) b.Run("Long log line K bound", func(b *testing.B) { x := []byte("10__8__0__146 kernel process Google Chrome Ca[3955] caught causing excessive wakeups. Observed wakeups rate (per sec): 392; Maximum permitted wakeups rate (per sec): 150; Observation period: 300 seconds; Task lifetime number of wakeups: 317314") y := []byte("10__8__0__146 kernel process Sublime Text[802] caught causing excessive wakeups. Observed wakeups rate (per sec): 233; Maximum permitted wakeups rate (per sec): 150; Observation period: 300 seconds; Task lifetime number of wakeups: 95333") @@ -73,16 +32,5 @@ func BenchmarkLevenshteinDistance(b *testing.B) { LevenshteinDistanceK(x, y, nil, nil, len(x)/4) } }) - b.Run("Long log line buffer reuse", func(b *testing.B) { - x := []byte("10__8__0__146 kernel process Google Chrome Ca[3955] caught causing excessive wakeups. Observed wakeups rate (per sec): 392; Maximum permitted wakeups rate (per sec): 150; Observation period: 300 seconds; Task lifetime number of wakeups: 317314") - y := []byte("10__8__0__146 kernel process Sublime Text[802] caught causing excessive wakeups. 
Observed wakeups rate (per sec): 233; Maximum permitted wakeups rate (per sec): 150; Observation period: 300 seconds; Task lifetime number of wakeups: 95333") - n := len(y) - v0 := make([]int, n+1, n+1) - v1 := make([]int, n+1, n+1) - b.ReportAllocs() - for i := 0; i < b.N; i++ { - LevenshteinDistance(x, y, v0, v1) - } - }) } diff --git a/pkg/processor/levenshtein.go b/pkg/processor/levenshtein.go index 0f55baa..0957086 100644 --- a/pkg/processor/levenshtein.go +++ b/pkg/processor/levenshtein.go @@ -6,25 +6,25 @@ import ( ) type levenshtein struct { - seen [][]byte + seen [][]string maxDiffRatio float64 } func newLevenshtein(maxDiffRatio float64) *levenshtein { - seen := make([][]byte, 0) + seen := make([][]string, 0) return &levenshtein{seen: seen, maxDiffRatio: maxDiffRatio} } func (le *levenshtein) process(in Line) Line { in.IsNew = true for _, l := range le.seen { - maxDiff := int(math.Ceil(float64(len(in.Input)) * le.maxDiffRatio)) - d := levenshtein2.LevenshteinDistanceK(in.Input, l, nil, nil, maxDiff) + maxDiff := int(math.Ceil(float64(len(in.Tokens)) * le.maxDiffRatio)) + d := levenshtein2.LevenshteinDistanceK(in.Tokens, l, nil, nil, maxDiff) if d <= maxDiff && d >= 0 { in.IsNew = false return in } } - le.seen = append(le.seen, in.Input) + le.seen = append(le.seen, in.Tokens) return in } diff --git a/pkg/processor/processor.go b/pkg/processor/processor.go index b235181..85741d6 100644 --- a/pkg/processor/processor.go +++ b/pkg/processor/processor.go @@ -1,12 +1,15 @@ package processor +import "github.com/blevesearch/segment" + type Processor struct { le *levenshtein } type Line struct { - Input []byte - IsNew bool + Input []byte + Tokens []string + IsNew bool } func NewProcessor(maxDiffRatio float64) *Processor { @@ -15,6 +18,16 @@ func NewProcessor(maxDiffRatio float64) *Processor { } func (p *Processor) Process(in Line) Line { + s := segment.NewSegmenterDirect(in.Input) + in.Tokens = make([]string, 0) + for s.Segment() { + t := s.Text() + // Number tokens 
will always be considered equal, a simple way to ignore timestamps, ids etc. + if s.Type() == segment.Number { + t = "*" + } + in.Tokens = append(in.Tokens, t) + } in = p.le.process(in) return in }