Skip to content

Commit

Permalink
gt: enable endomorphism + torus
Browse files Browse the repository at this point in the history
  • Loading branch information
mratsim committed Nov 28, 2024
1 parent 823b9f4 commit ff92dd0
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 15 deletions.
1 change: 1 addition & 0 deletions benchmarks/bench_gt_multiexp_bls12_381.nim
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ proc main() =
for numPoints in testNumPoints:
let batchIters = max(1, Iters div numPoints)
ctx12o4.multiExpParallelBench(numPoints, batchIters)
echo "----"
ctx12o6.multiExpParallelBench(numPoints, batchIters)
separator()
separator()
Expand Down
22 changes: 21 additions & 1 deletion benchmarks/bench_gt_parallel_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: in
var startNaive, stopNaive, startMultiExpBaseline, stopMultiExpBaseline: MonoTime
var startMultiExpOpt, stopMultiExpOpt: MonoTime
var startMultiExpPara, stopMultiExpPara: MonoTime
var startMultiExpParaTorus, stopMultiExpParaTorus: MonoTime

when GT is QuadraticExt:
var startMultiExpBaselineTorus: MonoTime
Expand Down Expand Up @@ -186,18 +187,30 @@ proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: in

startMultiExpPara = getMonotime()
bench("𝔾ₜ multi-exp " & align($ctx.tp.numThreads & " threads", 11) & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
ctx.tp.multiExp_vartime_parallel(r, elems, exponents)
ctx.tp.multiExp_vartime_parallel(r, elems, exponents, useTorus = false)
stopMultiExpPara = getMonotime()

ctx.tp.shutdown()

when GT is QuadraticExt:
block:
ctx.tp = Threadpool.new()

startMultiExpParaTorus = getMonotime()
bench("𝔾ₜ multi-exp torus" & align($ctx.tp.numThreads & " threads", 11) & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
ctx.tp.multiExp_vartime_parallel(r, elems, exponents, useTorus = true)
stopMultiExpParaTorus = getMonotime()

ctx.tp.shutdown()

let perfNaive = inNanoseconds((stopNaive-startNaive) div iters)
let perfMultiExpBaseline = inNanoseconds((stopMultiExpBaseline-startMultiExpBaseline) div iters)
let perfMultiExpOpt = inNanoseconds((stopMultiExpOpt-startMultiExpOpt) div iters)
let perfMultiExpPara = inNanoseconds((stopMultiExpPara-startMultiExpPara) div iters)
when GT is QuadraticExt:
let perfMultiExpBaselineTorus = inNanoseconds((stopMultiExpBaselineTorus-startMultiExpBaselineTorus) div iters)
let perfMultiExpOptTorus = inNanoseconds((stopMultiExpOptTorus-startMultiExpOptTorus) div iters)
let perfMultiExpParaTorus = inNanoSeconds((stopMultiExpParaTorus-startMultiExpParaTorus) div iters)

if numInputs <= 100000:
let speedupBaseline = float(perfNaive) / float(perfMultiExpBaseline)
Expand All @@ -215,3 +228,10 @@ proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: in

let speedupParaOpt = float(perfMultiExpOpt) / float(perfMultiExpPara)
echo &"Speedup ratio parallel over serial optimized linear combination: {speedupParaOpt:>6.3f}x"

when GT is QuadraticExt:
let speedupParaTorus = float(perfMultiExpOptTorus) / float(perfMultiExpParaTorus)
echo &"Speedup ratio parallel over serial for Torus-based multiexp: {speedupParaTorus:>6.3f}x"

let speedupParaTorusOpt = float(perfMultiExpPara) / float(perfMultiExpParaTorus)
echo &"Speedup ratio parallel over parallel Torus-based multiexp: {speedupParaTorusOpt:>6.3f}x"
31 changes: 17 additions & 14 deletions constantine/math/pairings/gt_multiexp_parallel.nim
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ proc applyEndoTorus_parallel[bits: static int, GT](
# but we could parallel batch convert over the whole range
endoTorusBasis[i].batchFromGT_vartime(endoBasis[i])

let endoTorusElems = cast[ptr UncheckedArray[GT]](endoTorusBasis)
let endoTorusElems = cast[ptr UncheckedArray[T2Aff[F]]](endoTorusBasis)
let endoExpos = cast[ptr UncheckedArray[BigInt[L]]](splitExpos)
freeHeapAligned(endoBasis)

Expand All @@ -258,7 +258,10 @@ template withEndoTorus[exponentsBits: static int, GT](
let (endoTorusElems, endoExpos, endoN) = applyEndoTorus_parallel(tp, elems, expos, N)
# Given that bits and N changed, we are able to use a bigger `c`
# TODO: bench
multiExpProc(tp, r, endoTorusElems, endoExpos, endoN, c)
type F = typeof(elems[0].c0)
var r_torus {.noInit.}: T2Prj[F]
multiExpProc(tp, r_torus.addr, endoTorusElems, endoExpos, endoN, c)
r[].fromTorus2_vartime(r_torus)
freeHeap(endoTorusElems)
freeHeap(endoExpos)
else:
Expand All @@ -283,18 +286,18 @@ proc multiexp_dispatch_vartime_parallel[bits: static int, GT](

when useTorus:
case c
of 2: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 2)
of 3: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 3)
of 4: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 4)
of 5: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 5)
of 6: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 6)
of 7: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 7)
of 8: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 8)
of 9: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 9)
of 10: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 10)
of 11: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 11)
of 12: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 12)
of 13: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 13)
of 2: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 2)
of 3: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 3)
of 4: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 4)
of 5: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 5)
of 6: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 6)
of 7: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 7)
of 8: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 8)
of 9: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 9)
of 10: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 10)
of 11: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 11)
of 12: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 12)
of 13: withEndoTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 13)
of 14: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 14)
of 15: withTorus(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 15)

Expand Down

0 comments on commit ff92dd0

Please sign in to comment.