Skip to content

Commit

Permalink
Diff tokenized lines, much faster
Browse files Browse the repository at this point in the history
  • Loading branch information
psykhi committed Jul 30, 2022
1 parent 74c570b commit 67148c5
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 218 deletions.
17 changes: 16 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ To see all options, use`uno -h`
### -d

The distance option `-d` can be used to specify how different a new line must be from the others we've seen to be deemed
new/unique. It can take a value between `0` and `1`. The default is `0.3` (30% difference)
new/unique. It can take a value between `0` and `1`. The default is `0.2` (20% difference)

### -all

Expand All @@ -74,3 +74,18 @@ To see all input lines and highlight the new ones in red, use `-all`
cat my_log_file.txt | uno -all
uno -all my_log_file.txt
```

### -p

Output log patterns (numbers are replaced by `*`)

```bash
cat my_log_file.txt | uno -p
uno -p my_log_file.txt

Jun * *:*:* combo ftpd[*]: connection from * (*-*-*-*.bflony.adelphia.net) at Fri Jun * *:*:* *
Jun * *:*:* combo cups: cupsd shutdown succeeded
Jul * *:*:* combo gpm[*]: *** info [mice.c(*)]:
Jul * *:*:* combo gpm[*]: imps2: Auto-detected intellimouse PS/*

```
7 changes: 5 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@ module github.com/psykhi/uno

go 1.18

require github.com/stretchr/testify v1.8.0
require (
github.com/blevesearch/segment v0.9.0
github.com/fatih/color v1.13.0
github.com/stretchr/testify v1.8.0
)

require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/fatih/color v1.13.0 // indirect
github.com/mattn/go-colorable v0.1.9 // indirect
github.com/mattn/go-isatty v0.0.14 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
github.com/blevesearch/segment v0.9.0 h1:5lG7yBCx98or7gK2cHMKPukPZ/31Kag7nONpoBt22Ac=
github.com/blevesearch/segment v0.9.0/go.mod h1:9PfHYUdQCgHktBgvtUOF4x+pc4/l8rdH0u5spnW85UQ=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
Expand Down
14 changes: 10 additions & 4 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ import (

func main() {
printAll := flag.Bool("all", false, "Print all lines and highlight new lines in red (if the terminal supports it)")
maxDiffRatio := flag.Float64("d", 0.3, "The maximum difference ratio between the input line and the other lines seen (between 0 and 1)")
maxDiffRatio := flag.Float64("d", 0.2, "The maximum difference ratio between the input line and the other lines seen (between 0 and 1)")
patterns := flag.Bool("p", false, "Show log patterns")

flag.Parse()

if *maxDiffRatio < 0 || *maxDiffRatio > 1 {
Expand Down Expand Up @@ -42,16 +44,20 @@ func main() {
l.Input = scanner.Bytes()
l.IsNew = false
l = p.Process(l)
toPrint := l.Input
if *patterns {
toPrint = []byte(strings.Join(l.Tokens, ""))
}
if l.IsNew {
if *printAll {
color.Red("%s", l.Input)
color.Red("%s", toPrint)
} else {
fmt.Printf("%s\n", l.Input)
fmt.Printf("%s\n", toPrint)
}
continue
}
if *printAll {
fmt.Printf("%s\n", l.Input)
fmt.Printf("%s\n", toPrint)
}
}

Expand Down
154 changes: 4 additions & 150 deletions pkg/levenshtein/levenshtein.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,44 +2,6 @@ package levenshtein

import "math"

// LevenshteinDistance returns the Levenshtein (edit) distance between the
// byte slices a and b, using the iterative two-row algorithm:
// https://en.wikipedia.org/wiki/Levenshtein_distance?section=9#Iterative_with_two_matrix_rows
//
// v0 and v1 are optional scratch rows that callers can pass in to avoid
// allocations across calls. Each needs at least len(b)+1 elements; a nil or
// shorter slice is replaced by a fresh allocation instead of causing an
// out-of-range panic.
func LevenshteinDistance(a []byte, b []byte, v0 []int, v1 []int) int {
	n := len(b)

	if len(v0) < n+1 {
		v0 = make([]int, n+1)
	}
	if len(v1) < n+1 {
		v1 = make([]int, n+1)
	}

	// Row 0: turning the empty prefix of a into b[:j] takes j insertions.
	for j := 0; j <= n; j++ {
		v0[j] = j
	}

	for i := range a {
		// Turning a[:i+1] into the empty prefix of b takes i+1 deletions.
		v1[0] = i + 1

		for j := range b {
			// Start from the deletion cost, then keep the cheapest of
			// deletion, insertion and substitution.
			cost := v0[j+1] + 1
			if ins := v1[j] + 1; ins < cost {
				cost = ins
			}
			sub := v0[j]
			if a[i] != b[j] {
				sub++
			}
			if sub < cost {
				cost = sub
			}
			v1[j+1] = cost
		}

		// The row just computed becomes the previous row of the next step.
		v0, v1 = v1, v0
	}

	return v0[n]
}

func min(a int, b int) int {
if a < b {
return a
Expand All @@ -54,80 +16,10 @@ func max(a int, b int) int {
return b
}

// LevenshteinDistanceK returns the Levenshtein (edit) distance between the
// byte slices a and b if that distance is at most k, and -1 otherwise.
//
// Only the cells within k of the main diagonal are evaluated on each row, so
// the work per row is bounded by 2k+1 cells; cells outside that band are
// treated as unreachable (math.MaxInt32).
//
// v0 and v1 are optional scratch rows that callers can pass in to avoid
// allocations across calls. Each needs at least min(len(a), len(b))+1
// elements; a nil or shorter slice is replaced by a fresh allocation.
func LevenshteinDistanceK(a []byte, b []byte, v0 []int, v1 []int, k int) int {
	// No distance can be <= a negative bound.
	if k < 0 {
		return -1
	}

	// The distance is symmetric: keep b as the shorter input so the scratch
	// rows stay as small as possible.
	if len(b) > len(a) {
		a, b = b, a
	}
	m, n := len(a), len(b)

	// With an empty side the distance is simply the other side's length.
	if n == 0 {
		if m <= k {
			return m
		}
		return -1
	}
	// At least m-n insertions/deletions are needed whatever the content.
	if m-n > k {
		return -1
	}

	if len(v0) < n+1 {
		v0 = make([]int, n+1)
	}
	if len(v1) < n+1 {
		v1 = make([]int, n+1)
	}

	// Row 0: b[:j] is reachable with j insertions while j stays within k.
	for j := 0; j <= n; j++ {
		if j <= k {
			v0[j] = j
		} else {
			v0[j] = math.MaxInt32
		}
		v1[j] = math.MaxInt32
	}

	for i := 1; i <= m; i++ {
		// Turning a[:i] into the empty prefix of b takes i deletions.
		v1[0] = i

		// [lo, hi] is the slice of row i that lies inside the band.
		lo := i - k
		if lo < 1 {
			lo = 1
		}
		hi := n
		// The first test guards i+k against overflowing the sentinel range.
		if i <= math.MaxInt32-k && i+k < n {
			hi = i + k
		}

		if lo > hi {
			return -1
		}
		// The cell left of the band must not look reachable.
		if lo > 1 {
			v1[lo-1] = math.MaxInt32
		}

		for j := lo; j <= hi; j++ {
			if a[i-1] == b[j-1] {
				// Matching items extend the diagonal at no cost.
				v1[j] = v0[j-1]
				continue
			}
			// One edit on top of the cheapest neighbour
			// (substitution, deletion or insertion).
			best := v0[j-1]
			if v0[j] < best {
				best = v0[j]
			}
			if v1[j-1] < best {
				best = v1[j-1]
			}
			v1[j] = best + 1
		}

		// The row just computed becomes the previous row of the next step.
		v0, v1 = v1, v0
	}

	if v0[n] <= k {
		return v0[n]
	}
	return -1
}

func LevenshteinDistanceKStrings(a []string, b []string, v0 []int, v1 []int, k int) int {
// Compute Levenshtein distance on byte arrays
// https://en.wikipedia.org/wiki/Levenshtein_distance?section=9#Iterative_with_two_matrix_rows
// v0 and v1 buffers can be provided in order to reuse them and avoid allocations
func LevenshteinDistanceK[token comparable](a []token, b []token, v0 []int, v1 []int, k int) int {
m := len(a)
n := len(b)

Expand Down Expand Up @@ -200,44 +92,6 @@ func LevenshteinDistanceKStrings(a []string, b []string, v0 []int, v1 []int, k i
return -1
}

// LevenshteinDistanceStrings returns the Levenshtein (edit) distance between
// the string sequences a and b, where each whole string counts as one item.
// It uses the iterative two-row algorithm:
// https://en.wikipedia.org/wiki/Levenshtein_distance?section=9#Iterative_with_two_matrix_rows
//
// v0 and v1 are optional scratch rows that callers can pass in to avoid
// allocations across calls. Each needs at least len(b)+1 elements; a nil or
// shorter slice is replaced by a fresh allocation instead of causing an
// out-of-range panic.
func LevenshteinDistanceStrings(a []string, b []string, v0 []int, v1 []int) int {
	n := len(b)

	if len(v0) < n+1 {
		v0 = make([]int, n+1)
	}
	if len(v1) < n+1 {
		v1 = make([]int, n+1)
	}

	// Row 0: turning the empty prefix of a into b[:j] takes j insertions.
	for j := 0; j <= n; j++ {
		v0[j] = j
	}

	for i := range a {
		// Turning a[:i+1] into the empty prefix of b takes i+1 deletions.
		v1[0] = i + 1

		for j := range b {
			// Start from the deletion cost, then keep the cheapest of
			// deletion, insertion and substitution.
			cost := v0[j+1] + 1
			if ins := v1[j] + 1; ins < cost {
				cost = ins
			}
			sub := v0[j]
			if a[i] != b[j] {
				sub++
			}
			if sub < cost {
				cost = sub
			}
			v1[j+1] = cost
		}

		// The row just computed becomes the previous row of the next step.
		v0, v1 = v1, v0
	}

	return v0[n]
}

func min3(a, b, c int) int {
if b < c {
if b < a {
Expand Down
56 changes: 2 additions & 54 deletions pkg/levenshtein/levenshtein_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,6 @@ import (
"testing"
)

// seq wraps a byte slice so that its items can be read through index-based
// accessors.
type seq struct {
	chars []byte
}

// Len reports how many bytes the sequence holds.
func (sq *seq) Len() int {
	n := len(sq.chars)
	return n
}

// Val returns the byte stored at position i, boxed as an interface value.
// It panics if i is out of range, like any slice index expression.
func (sq *seq) Val(i int) any {
	item := sq.chars[i]
	return item
}

func TestDistanceK(t *testing.T) {
d := LevenshteinDistanceK([]byte("kitten"), []byte("sitting"), nil, nil, 3)
assert.Equal(t, 3, d)
Expand All @@ -29,41 +17,12 @@ func TestDistanceK(t *testing.T) {

d = LevenshteinDistanceK([]byte("elephant"), []byte(""), nil, nil, 100)
assert.Equal(t, 8, d)
}

func TestDistance(t *testing.T) {
d := LevenshteinDistance([]byte("kitten"), []byte("sitting"), nil, nil)
assert.Equal(t, 3, d)
d = LevenshteinDistanceK([]string{"hello", "world"}, []string{"hello", "earth"}, nil, nil, 100)
assert.Equal(t, 1, d)
}

func BenchmarkLevenshteinDistance(b *testing.B) {
b.Run("6 char string", func(b *testing.B) {
x := []byte("ABCDEF")
y := []byte("ABCCDEF")

b.ReportAllocs()
for i := 0; i < b.N; i++ {
LevenshteinDistance(x, y, nil, nil)
}
})
b.Run("25-30 char string", func(b *testing.B) {
x := []byte("This is a longer string")
y := []byte("This is a much longer string")

b.ReportAllocs()
for i := 0; i < b.N; i++ {
LevenshteinDistance(x, y, nil, nil)
}
})
b.Run("Long log line", func(b *testing.B) {
x := []byte("10__8__0__146 kernel process Google Chrome Ca[3955] caught causing excessive wakeups. Observed wakeups rate (per sec): 392; Maximum permitted wakeups rate (per sec): 150; Observation period: 300 seconds; Task lifetime number of wakeups: 317314")
y := []byte("10__8__0__146 kernel process Sublime Text[802] caught causing excessive wakeups. Observed wakeups rate (per sec): 233; Maximum permitted wakeups rate (per sec): 150; Observation period: 300 seconds; Task lifetime number of wakeups: 95333")

b.ReportAllocs()
for i := 0; i < b.N; i++ {
LevenshteinDistance(x, y, nil, nil)
}
})
b.Run("Long log line K bound", func(b *testing.B) {
x := []byte("10__8__0__146 kernel process Google Chrome Ca[3955] caught causing excessive wakeups. Observed wakeups rate (per sec): 392; Maximum permitted wakeups rate (per sec): 150; Observation period: 300 seconds; Task lifetime number of wakeups: 317314")
y := []byte("10__8__0__146 kernel process Sublime Text[802] caught causing excessive wakeups. Observed wakeups rate (per sec): 233; Maximum permitted wakeups rate (per sec): 150; Observation period: 300 seconds; Task lifetime number of wakeups: 95333")
Expand All @@ -73,16 +32,5 @@ func BenchmarkLevenshteinDistance(b *testing.B) {
LevenshteinDistanceK(x, y, nil, nil, len(x)/4)
}
})
b.Run("Long log line buffer reuse", func(b *testing.B) {
x := []byte("10__8__0__146 kernel process Google Chrome Ca[3955] caught causing excessive wakeups. Observed wakeups rate (per sec): 392; Maximum permitted wakeups rate (per sec): 150; Observation period: 300 seconds; Task lifetime number of wakeups: 317314")
y := []byte("10__8__0__146 kernel process Sublime Text[802] caught causing excessive wakeups. Observed wakeups rate (per sec): 233; Maximum permitted wakeups rate (per sec): 150; Observation period: 300 seconds; Task lifetime number of wakeups: 95333")
n := len(y)
v0 := make([]int, n+1, n+1)
v1 := make([]int, n+1, n+1)
b.ReportAllocs()
for i := 0; i < b.N; i++ {
LevenshteinDistance(x, y, v0, v1)
}
})

}
10 changes: 5 additions & 5 deletions pkg/processor/levenshtein.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,25 @@ import (
)

type levenshtein struct {
seen [][]byte
seen [][]string
maxDiffRatio float64
}

func newLevenshtein(maxDiffRatio float64) *levenshtein {
seen := make([][]byte, 0)
seen := make([][]string, 0)
return &levenshtein{seen: seen, maxDiffRatio: maxDiffRatio}
}

func (le *levenshtein) process(in Line) Line {
in.IsNew = true
for _, l := range le.seen {
maxDiff := int(math.Ceil(float64(len(in.Input)) * le.maxDiffRatio))
d := levenshtein2.LevenshteinDistanceK(in.Input, l, nil, nil, maxDiff)
maxDiff := int(math.Ceil(float64(len(in.Tokens)) * le.maxDiffRatio))
d := levenshtein2.LevenshteinDistanceK(in.Tokens, l, nil, nil, maxDiff)
if d <= maxDiff && d >= 0 {
in.IsNew = false
return in
}
}
le.seen = append(le.seen, in.Input)
le.seen = append(le.seen, in.Tokens)
return in
}
Loading

0 comments on commit 67148c5

Please sign in to comment.