From 14e8ef63488894653ef51d3f8fe37711ff566888 Mon Sep 17 00:00:00 2001 From: aarzilli Date: Sun, 26 Aug 2018 11:47:08 +0200 Subject: [PATCH] implement a SIMD version of drawFillOver --- context.go | 96 ++++++++++++++++++++++++------ drawfillover_amd64.go | 14 +++++ drawfillover_amd64.s | 77 ++++++++++++++++++++++++ drawfillover_other.go | 31 ++++++++++ drawfillover_test.go | 134 ++++++++++++++++++++++++++++++++++++++++++ shiny.go | 2 +- 6 files changed, 336 insertions(+), 18 deletions(-) create mode 100644 drawfillover_amd64.go create mode 100644 drawfillover_amd64.s create mode 100644 drawfillover_other.go create mode 100644 drawfillover_test.go diff --git a/context.go b/context.go index c4a12fb..8b36222 100644 --- a/context.go +++ b/context.go @@ -283,10 +283,10 @@ func (ctx *context) restackClick(w *Window) bool { } var cnt = 0 -var ln, frect, brrect, frrect, ftri, circ, fcirc, txt int +var ln, frect, frectover, brrect, frrect, ftri, circ, fcirc, txt int func (ctx *context) Draw(wimg *image.RGBA) int { - var txttim, tritim, brecttim, frecttim, frrecttim time.Duration + var txttim, tritim, brecttim, frecttim, frectovertim, frrecttim time.Duration var t0 time.Time img := wimg @@ -331,11 +331,11 @@ func (ctx *context) Draw(wimg *image.RGBA) int { if cmd.Begin.X == cmd.End.X { // draw vertical line r := image.Rect(cmd.Begin.X-h1, cmd.Begin.Y, cmd.Begin.X+h2, cmd.End.Y) - draw.Draw(img, r, colimg, r.Min, op) + drawFill(img, r, colimg, r.Min, op) } else if cmd.Begin.Y == cmd.End.Y { // draw horizontal line r := image.Rect(cmd.Begin.X, cmd.Begin.Y-h1, cmd.End.X, cmd.Begin.Y+h2) - draw.Draw(img, r, colimg, r.Min, op) + drawFill(img, r, colimg, r.Min, op) } else { if rasterizer == nil { setupRasterizer() @@ -399,8 +399,8 @@ func (ctx *context) Draw(wimg *image.RGBA) int { top := image.Rect(body.Min.X, body.Min.Y, body.Max.X, body.Min.Y+border) bot := image.Rect(body.Min.X, body.Max.Y-border, body.Max.X, body.Max.Y) - draw.Draw(img, top, colimg, top.Min, op) - draw.Draw(img, bot, colimg, bot.Min, op) + drawFill(img, top, colimg, top.Min, op) + drawFill(img, bot, colimg, bot.Min, op) if border < int(cmd.Rounding) { // wings need shrinking @@ -414,23 +414,27 @@ func (ctx *context) Draw(wimg *image.RGBA) int { xlwing := image.Rect(top.Min.X, top.Max.Y, top.Min.X+d, bot.Min.Y) xrwing := image.Rect(top.Max.X-d, top.Max.Y, top.Max.X, bot.Min.Y) - draw.Draw(img, xlwing, colimg, xlwing.Min, op) - draw.Draw(img, xrwing, colimg, xrwing.Min, op) + drawFill(img, xlwing, colimg, xlwing.Min, op) + drawFill(img, xrwing, colimg, xrwing.Min, op) } brrect++ } else { - draw.Draw(img, body, colimg, body.Min, op) + drawFill(img, body, colimg, body.Min, op) if cmd.Rounding == 0 { - frect++ + if op == draw.Src { + frect++ + } else { + frectover++ + } } else { frrect++ } } if rounding { - draw.Draw(img, lwing, colimg, lwing.Min, op) - draw.Draw(img, rwing, colimg, rwing.Min, op) + drawFill(img, lwing, colimg, lwing.Min, op) + drawFill(img, rwing, colimg, rwing.Min, op) rangle := math.Pi / 2 @@ -454,7 +458,15 @@ func (ctx *context) Draw(wimg *image.RGBA) int { if cmd.Rounding > 0 { frrecttim += time.Now().Sub(t0) } else { - frecttim += time.Now().Sub(t0) + d := time.Now().Sub(t0) + if op == draw.Src { + frecttim += d + } else { + if d > 8*time.Millisecond { + fmt.Printf("outstanding rect") + } + frectovertim += d + } } } } @@ -529,13 +541,13 @@ func (ctx *context) Draw(wimg *image.RGBA) int { } if perfUpdate { - fmt.Printf("triangle: %0.4fms text: %0.4fms brect: %0.4fms frect: %0.4fms frrect %0.4f\n", tritim.Seconds()*1000, txttim.Seconds()*1000, brecttim.Seconds()*1000, frecttim.Seconds()*1000, frrecttim.Seconds()*1000) + fmt.Printf("triangle: %0.4fms text: %0.4fms brect: %0.4fms frect: %0.4fms frectover: %0.4fms frrect %0.4f\n", tritim.Seconds()*1000, txttim.Seconds()*1000, brecttim.Seconds()*1000, frecttim.Seconds()*1000, frectovertim.Seconds()*1000, frrecttim.Seconds()*1000) } cnt++ - if perfUpdate && (cnt%100) == 0 { - fmt.Printf("ln %d, frect %d, frrect %d, brrect %d, ftri %d, circ %d, fcirc %d, txt %d\n", ln, frect, frrect, brrect, ftri, circ, fcirc, txt) - ln, frect, frrect, brrect, ftri, circ, fcirc, txt = 0, 0, 0, 0, 0, 0, 0, 0 + if perfUpdate /*&& (cnt%100) == 0*/ { + fmt.Printf("ln %d, frect %d, frectover %d, frrect %d, brrect %d, ftri %d, circ %d, fcirc %d, txt %d\n", ln, frect, frectover, frrect, brrect, ftri, circ, fcirc, txt) + ln, frect, frectover, frrect, brrect, ftri, circ, fcirc, txt = 0, 0, 0, 0, 0, 0, 0, 0, 0 } return len(ctx.cmds) @@ -942,3 +954,53 @@ func percentages(bounds rect.Rect, f float64) (r [4]rect.Rect) { r[3].W = pw return } + +func clip(dst *image.RGBA, r *image.Rectangle, src image.Image, sp *image.Point) { + orig := r.Min + *r = r.Intersect(dst.Bounds()) + *r = r.Intersect(src.Bounds().Add(orig.Sub(*sp))) + dx := r.Min.X - orig.X + dy := r.Min.Y - orig.Y + if dx == 0 && dy == 0 { + return + } + sp.X += dx + sp.Y += dy +} + +func drawFill(dst *image.RGBA, r image.Rectangle, src *image.Uniform, sp image.Point, op draw.Op) { + clip(dst, &r, src, &sp) + sr, sg, sb, sa := src.RGBA() + switch op { + case draw.Over: + drawFillOver(dst, r, sr, sg, sb, sa) + case draw.Src: + drawFillSrc(dst, r, sr, sg, sb, sa) + default: + draw.Draw(dst, r, src, sp, op) + } +} + +func drawFillSrc(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) { + sr8 := uint8(sr >> 8) + sg8 := uint8(sg >> 8) + sb8 := uint8(sb >> 8) + sa8 := uint8(sa >> 8) + // The built-in copy function is faster than a straightforward for loop to fill the destination with + // the color, but copy requires a slice source. We therefore use a for loop to fill the first row, and + // then use the first row as the slice source for the remaining rows. + i0 := dst.PixOffset(r.Min.X, r.Min.Y) + i1 := i0 + r.Dx()*4 + for i := i0; i < i1; i += 4 { + dst.Pix[i+0] = sr8 + dst.Pix[i+1] = sg8 + dst.Pix[i+2] = sb8 + dst.Pix[i+3] = sa8 + } + firstRow := dst.Pix[i0:i1] + for y := r.Min.Y + 1; y < r.Max.Y; y++ { + i0 += dst.Stride + i1 += dst.Stride + copy(dst.Pix[i0:i1], firstRow) + } +} diff --git a/drawfillover_amd64.go b/drawfillover_amd64.go new file mode 100644 index 0000000..fa52e26 --- /dev/null +++ b/drawfillover_amd64.go @@ -0,0 +1,14 @@ +package nucular + +import "image" + +func drawFillOver_SIMD_internal(base *uint8, i0, i1 int, stride, n int, adivm, sr, sg, sb, sa uint32) + +func drawFillOver(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) { + const m = 1<<16 - 1 + a := (m - sa) * 0x101 + adivm := a / m + i0 := dst.PixOffset(r.Min.X, r.Min.Y) + i1 := i0 + r.Dx()*4 + drawFillOver_SIMD_internal(&dst.Pix[0], i0, i1, dst.Stride, r.Max.Y-r.Min.Y, adivm, sr, sg, sb, sa) +} diff --git a/drawfillover_amd64.s b/drawfillover_amd64.s new file mode 100644 index 0000000..1515d09 --- /dev/null +++ b/drawfillover_amd64.s @@ -0,0 +1,77 @@ +#include "textflag.h" + +GLOBL drawFillOver_SIMD_shufflemap<>(SB), (NOPTR+RODATA), $4 +DATA drawFillOver_SIMD_shufflemap<>+0x00(SB)/4, $0x0d090501 + +TEXT ·drawFillOver_SIMD_internal(SB),0,$0-60 + // base+0(FP) + // i0+8(FP) + // i1+16(FP) + // stride+24(FP) + // n+32(FP) + // adivm+40(FP) + // sr+44(FP) + // sg+48(FP) + // sb+52(FP) + // sa+56(FP) + + // DX row index + // CX column index + // AX pointer to current pixel + // R14 i0 + // R15 i1 + + // X0 zeroed register + // X1 current pixel + // X3 source pixel + // X4 is the shuffle map to do the >> 8 and pack everything back into a single 32bit value + + MOVSS drawFillOver_SIMD_shufflemap<>(SB), X4 + + PXOR X0, X0 + MOVQ i0+8(FP), R14 + MOVQ i1+16(FP), R15 + + // load adivm to X2, fill all uint32s with it + MOVSS advim+40(FP), X2 + VBROADCASTSS X2, X2 + + // load source pixel to X3 + VMOVDQU sr+44(FP), X3 + + MOVQ $0, DX +row_loop: + CMPQ DX, n+32(FP) + JGE row_loop_end + + MOVQ R14, CX + MOVQ base+0(FP), AX + LEAQ (AX)(CX*1), AX +column_loop: + CMPQ CX, R15 + JGE column_loop_end + + // load current pixel to X1, unpack twice to get uint32s + MOVSS (AX), X1 + PUNPCKLBW X0, X1 + VPUNPCKLWD X0, X1, X1 + + VPMULLD X2, X1, X1 // component * a/m + VPADDD X3, X1, X1 // (component * a/m) + source_component + + VPSHUFB X4, X1, X1 // get the second byte of every 32bit word and pack it into the lowest word of X1 + MOVSS X1, (AX) // write back to memory + + ADDQ $4, CX + ADDQ $4, AX + JMP column_loop + +column_loop_end: + ADDQ stride+24(FP), R14 + ADDQ stride+24(FP), R15 + INCQ DX + JMP row_loop + +row_loop_end: + + RET diff --git a/drawfillover_other.go b/drawfillover_other.go new file mode 100644 index 0000000..7a412b6 --- /dev/null +++ b/drawfillover_other.go @@ -0,0 +1,31 @@ +// +build !amd64 + +package nucular + +import ( + "image" +) + +func drawFillOver(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) { + fmt.Printf("fucked up!\n") + const m = 1<<16 - 1 + // The 0x101 is here for the same reason as in drawRGBA. + a := (m - sa) * 0x101 + i0 := dst.PixOffset(r.Min.X, r.Min.Y) + i1 := i0 + r.Dx()*4 + for y := r.Min.Y; y != r.Max.Y; y++ { + for i := i0; i < i1; i += 4 { + dr := &dst.Pix[i+0] + dg := &dst.Pix[i+1] + db := &dst.Pix[i+2] + da := &dst.Pix[i+3] + + *dr = uint8((uint32(*dr)*a/m + sr) >> 8) + *dg = uint8((uint32(*dg)*a/m + sg) >> 8) + *db = uint8((uint32(*db)*a/m + sb) >> 8) + *da = uint8((uint32(*da)*a/m + sa) >> 8) + } + i0 += dst.Stride + i1 += dst.Stride + } +} diff --git a/drawfillover_test.go b/drawfillover_test.go new file mode 100644 index 0000000..fcd5cab --- /dev/null +++ b/drawfillover_test.go @@ -0,0 +1,134 @@ +package nucular + +import ( + "image" + "testing" +) + +func drawFillOver_Normal(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) { + const m = 1<<16 - 1 + // The 0x101 is here for the same reason as in drawRGBA. + a := (m - sa) * 0x101 + i0 := dst.PixOffset(r.Min.X, r.Min.Y) + i1 := i0 + r.Dx()*4 + for y := r.Min.Y; y != r.Max.Y; y++ { + for i := i0; i < i1; i += 4 { + dr := &dst.Pix[i+0] + dg := &dst.Pix[i+1] + db := &dst.Pix[i+2] + da := &dst.Pix[i+3] + + *dr = uint8((uint32(*dr)*a/m + sr) >> 8) + *dg = uint8((uint32(*dg)*a/m + sg) >> 8) + *db = uint8((uint32(*db)*a/m + sb) >> 8) + *da = uint8((uint32(*da)*a/m + sa) >> 8) + } + i0 += dst.Stride + i1 += dst.Stride + } +} + +func drawFillOver_NoPtr(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) { + const m = 1<<16 - 1 + // The 0x101 is here for the same reason as in drawRGBA. + a := (m - sa) * 0x101 + i0 := dst.PixOffset(r.Min.X, r.Min.Y) + i1 := i0 + r.Dx()*4 + for y := r.Min.Y; y != r.Max.Y; y++ { + for i := i0; i < i1; i += 4 { + dst.Pix[i+0] = uint8((uint32(dst.Pix[i+0])*a/m + sr) >> 8) + dst.Pix[i+1] = uint8((uint32(dst.Pix[i+1])*a/m + sg) >> 8) + dst.Pix[i+2] = uint8((uint32(dst.Pix[i+2])*a/m + sb) >> 8) + dst.Pix[i+3] = uint8((uint32(dst.Pix[i+3])*a/m + sa) >> 8) + } + i0 += dst.Stride + i1 += dst.Stride + } +} + +func drawFillOver_SIMD(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) { + const m = 1<<16 - 1 + a := (m - sa) * 0x101 + adivm := a / m + i0 := dst.PixOffset(r.Min.X, r.Min.Y) + i1 := i0 + r.Dx()*4 + drawFillOver_SIMD_internal(&dst.Pix[0], i0, i1, dst.Stride, r.Max.Y-r.Min.Y, adivm, sr, sg, sb, sa) +} + +func clearImg(b *image.RGBA) { + for i := 0; i < len(b.Pix); i += 4 { + b.Pix[i+0] = 50 + b.Pix[i+1] = 50 + b.Pix[i+2] = 50 + b.Pix[i+3] = 255 + } +} + +func checkUniform(t *testing.T, b *image.RGBA, tgtr, tgtg, tgtb, tgta uint8) { + ok := true + for i := 0; i < len(b.Pix); i += 4 { + if b.Pix[i+0] != tgtr { + ok = false + t.Errorf("mismatch at pixel %d (red) %d %d\n", i/4, b.Pix[i+0], tgtr) + } + if b.Pix[i+1] != tgtg { + ok = false + t.Errorf("mismatch at pixel %d (green) %d %d\n", i/4, b.Pix[i+1], tgtg) + } + if b.Pix[i+2] != tgtb { + ok = false + t.Errorf("mismatch at pixel %d (blue) %d %d\n", i/4, b.Pix[i+2], tgtb) + } + if b.Pix[i+3] != tgta { + ok = false + t.Errorf("mismatch at pixel %d (alpha) %d %d\n", i/4, b.Pix[i+3], tgta) + } + if !ok { + t.Fatal("previous errors") + } + } + outr, outg, outb, outa := b.Pix[0], b.Pix[1], b.Pix[2], b.Pix[3] + t.Logf("color %d %d %d %d\n", outr, outg, outb, outa) +} + +type fillOverFunc func(b *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) + +func testFillOver(t *testing.T, b *image.RGBA, fo fillOverFunc) { + clearImg(b) + fo(b, b.Bounds(), 12850, 14906, 15677, 57825) + checkUniform(t, b, 56, 64, 67, 255) +} + +func TestDrawFillOver(t *testing.T) { + b := image.NewRGBA(image.Rect(0, 0, 2550, 1400)) + testFillOver(t, b, drawFillOver_Normal) + testFillOver(t, b, drawFillOver_NoPtr) + testFillOver(t, b, drawFillOver_SIMD) +} + +func benchFillOver(bnc *testing.B, fo fillOverFunc) { + bnc.StopTimer() + b := image.NewRGBA(image.Rect(0, 0, 2550, 1400)) + + for n := 0; n < bnc.N; n++ { + clearImg(b) + bnc.StartTimer() + fo(b, b.Bounds(), 12850, 14906, 15677, 57825) + bnc.StopTimer() + } + +} + +// go test -bench=DrawFillOver -run=NONE -v + +func BenchmarkDrawFillOverNormal(bnc *testing.B) { // 18734046 ns/op + benchFillOver(bnc, drawFillOver_Normal) +} + +func BenchmarkDrawFillOverNoPtr(bnc *testing.B) { // 19357654 ns/op + benchFillOver(bnc, drawFillOver_NoPtr) +} + +func BenchmarkDrawFillOverSIMD(bnc *testing.B) { // 4644812 ns/op + benchFillOver(bnc, drawFillOver_SIMD) +} diff --git a/shiny.go b/shiny.go index 984996a..067cd88 100644 --- a/shiny.go +++ b/shiny.go @@ -363,7 +363,7 @@ func (w *masterWindow) updateLocked() { t1 = time.Now() } nprimitives := w.draw() - if perfUpdate { + if perfUpdate && nprimitives > 0 { te = time.Now() fps := 1.0 / te.Sub(t0).Seconds()