From d90c332cf8dbe4c209aaedc350e830ac6a6bfb9f Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Tue, 4 Jan 2022 17:16:11 +0100 Subject: [PATCH 01/16] feat(GT): exponentiation in the cyclotomic subgroups of E6, E12, E24 --- ecc/bls12-377/internal/fptower/e12.go | 21 ++++++++++++++++ ecc/bls12-377/internal/fptower/e12_test.go | 25 +++++++++++++++++++ ecc/bls12-381/internal/fptower/e12.go | 21 ++++++++++++++++ ecc/bls12-381/internal/fptower/e12_test.go | 25 +++++++++++++++++++ ecc/bls24-315/internal/fptower/e24.go | 21 ++++++++++++++++ ecc/bls24-315/internal/fptower/e24_test.go | 25 +++++++++++++++++++ ecc/bn254/internal/fptower/e12.go | 21 ++++++++++++++++ ecc/bn254/internal/fptower/e12_test.go | 25 +++++++++++++++++++ ecc/bw6-633/internal/fptower/e6.go | 21 ++++++++++++++++ ecc/bw6-633/internal/fptower/e6_test.go | 25 +++++++++++++++++++ ecc/bw6-761/internal/fptower/e6.go | 21 ++++++++++++++++ ecc/bw6-761/internal/fptower/e6_test.go | 25 +++++++++++++++++++ .../template/fq12over6over2/fq12.go.tmpl | 21 ++++++++++++++++ .../fq12over6over2/tests/fq12.go.tmpl | 25 +++++++++++++++++++ 14 files changed, 322 insertions(+) diff --git a/ecc/bls12-377/internal/fptower/e12.go b/ecc/bls12-377/internal/fptower/e12.go index cbc2606db8..06d129ad98 100644 --- a/ecc/bls12-377/internal/fptower/e12.go +++ b/ecc/bls12-377/internal/fptower/e12.go @@ -19,6 +19,7 @@ package fptower import ( "encoding/binary" "errors" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" "math/big" ) @@ -387,6 +388,26 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the cyclotomic subgroup +func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { + var res, xInv E12 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/ecc/bls12-377/internal/fptower/e12_test.go b/ecc/bls12-377/internal/fptower/e12_test.go index a7cfec865a..c3ec4804f9 100644 --- a/ecc/bls12-377/internal/fptower/e12_test.go +++ b/ecc/bls12-377/internal/fptower/e12_test.go @@ -17,6 +17,7 @@ package fptower import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" @@ -192,6 +193,7 @@ func TestE12Ops(t *testing.T) { genA := GenE12() genB := GenE12() + genExp := GenFp() properties.Property("[BLS12-377] sub & add should leave an element invariant", prop.ForAll( func(a, b *E12) bool { @@ -349,6 +351,29 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[BLS12-377] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E12, e fp.Element) bool { + var b, c, d E12 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BLS12-377] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/ecc/bls12-381/internal/fptower/e12.go b/ecc/bls12-381/internal/fptower/e12.go index bb92cae533..8935d7b656 100644 
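A note on the tests above: an arbitrary a in E12 is first projected into the cyclotomic subgroup by applying the "easy part" of the final exponentiation. Conjugation over Fp12/Fp6 is the p^6-power Frobenius, so b.Conjugate(a); a.Inverse(a); b.Mul(&b, a) computes a^(p^6-1), and a.FrobeniusSquare(&b).Mul(a, &b) raises that to p^2+1. Since p^12 - 1 = (p^6 - 1)(p^2 + 1)(p^4 - p^2 + 1), the result has order dividing Phi_12(p) = p^4 - p^2 + 1, i.e. it lies in the cyclotomic subgroup where CyclotomicSquare is valid. The E24 and E6 tests use the analogous factorizations, p^24 - 1 = (p^12 - 1)(p^4 + 1) Phi_24(p) via FrobeniusQuad and p^6 - 1 = (p^3 - 1)(p + 1) Phi_6(p) via Frobenius. A minimal math/big check of the E12 factorization (the prime below is an arbitrary stand-in, not a curve parameter):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	// arbitrary stand-in prime; the identity below holds for any p
	p := big.NewInt(1000003)
	one := big.NewInt(1)

	p2 := new(big.Int).Mul(p, p)
	p4 := new(big.Int).Mul(p2, p2)
	p6 := new(big.Int).Mul(p4, p2)
	p12 := new(big.Int).Mul(p6, p6)

	lhs := new(big.Int).Sub(p12, one) // p^12 - 1

	f1 := new(big.Int).Sub(p6, one) // p^6 - 1: Conjugate times Inverse
	f2 := new(big.Int).Add(p2, one) // p^2 + 1: FrobeniusSquare times identity
	f3 := new(big.Int).Sub(p4, p2)
	f3.Add(f3, one) // Phi_12(p) = p^4 - p^2 + 1

	rhs := new(big.Int).Mul(f1, f2)
	rhs.Mul(rhs, f3)

	fmt.Println(lhs.Cmp(rhs) == 0) // true: (p^6-1)(p^2+1) maps into the Phi_12 subgroup
}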
--- a/ecc/bls12-381/internal/fptower/e12.go +++ b/ecc/bls12-381/internal/fptower/e12.go @@ -19,6 +19,7 @@ package fptower import ( "encoding/binary" "errors" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" "math/big" ) @@ -387,6 +388,26 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the cyclotomic subgroup +func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { + var res, xInv E12 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/ecc/bls12-381/internal/fptower/e12_test.go b/ecc/bls12-381/internal/fptower/e12_test.go index 6901a716e0..4011baaab2 100644 --- a/ecc/bls12-381/internal/fptower/e12_test.go +++ b/ecc/bls12-381/internal/fptower/e12_test.go @@ -17,6 +17,7 @@ package fptower import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" @@ -192,6 +193,7 @@ func TestE12Ops(t *testing.T) { genA := GenE12() genB := GenE12() + genExp := GenFp() properties.Property("[BLS12-381] sub & add should leave an element invariant", prop.ForAll( func(a, b *E12) bool { @@ -349,6 +351,29 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[BLS12-381] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E12, e fp.Element) bool { + var b, c, d E12 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BLS12-381] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/ecc/bls24-315/internal/fptower/e24.go b/ecc/bls24-315/internal/fptower/e24.go index c76a520102..ed8444f448 100644 --- a/ecc/bls24-315/internal/fptower/e24.go +++ b/ecc/bls24-315/internal/fptower/e24.go @@ -18,6 +18,7 @@ package fptower import ( "errors" + "github.com/consensys/gnark-crypto/ecc" "math/big" ) @@ -385,6 +386,26 @@ func (z *E24) Exp(x *E24, e big.Int) *E24 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the cyclotomic subgroup +func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { + var res, xInv E24 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E24) InverseUnitary(x *E24) *E24 { return z.Conjugate(x) diff --git a/ecc/bls24-315/internal/fptower/e24_test.go b/ecc/bls24-315/internal/fptower/e24_test.go index 32edf7e915..a67224737e 100644 --- a/ecc/bls24-315/internal/fptower/e24_test.go +++ b/ecc/bls24-315/internal/fptower/e24_test.go @@ -17,6 +17,7 @@ package fptower import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" @@ -192,6 +193,7 @@ func TestE24Ops(t *testing.T) { genA := 
GenE24() genB := GenE24() + genExp := GenFp() properties.Property("[BLS24-315] sub & add should leave an element invariant", prop.ForAll( func(a, b *E24) bool { @@ -371,6 +373,29 @@ func TestE24Ops(t *testing.T) { genA, )) + properties.Property("[BLS24-315] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E24, e fp.Element) bool { + var b, c, d E24 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusQuad(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(24) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BLS24-315] Frobenius of x in E24 should be equal to x^q", prop.ForAll( func(a *E24) bool { var b, c E24 diff --git a/ecc/bn254/internal/fptower/e12.go b/ecc/bn254/internal/fptower/e12.go index 3f8c763fb8..e29f7d0a93 100644 --- a/ecc/bn254/internal/fptower/e12.go +++ b/ecc/bn254/internal/fptower/e12.go @@ -19,6 +19,7 @@ package fptower import ( "encoding/binary" "errors" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fp" "math/big" ) @@ -387,6 +388,26 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the cyclotomic subgroup +func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { + var res, xInv E12 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/ecc/bn254/internal/fptower/e12_test.go b/ecc/bn254/internal/fptower/e12_test.go index 6c56c86805..942720cbe3 100644 --- a/ecc/bn254/internal/fptower/e12_test.go +++ b/ecc/bn254/internal/fptower/e12_test.go @@ -17,6 +17,7 @@ package fptower import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/fp" @@ -192,6 +193,7 @@ func TestE12Ops(t *testing.T) { genA := GenE12() genB := GenE12() + genExp := GenFp() properties.Property("[BN254] sub & add should leave an element invariant", prop.ForAll( func(a, b *E12) bool { @@ -349,6 +351,29 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[BN254] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E12, e fp.Element) bool { + var b, c, d E12 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BN254] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/ecc/bw6-633/internal/fptower/e6.go b/ecc/bw6-633/internal/fptower/e6.go index 19e6c860e7..c938b91f3f 100644 --- a/ecc/bw6-633/internal/fptower/e6.go +++ b/ecc/bw6-633/internal/fptower/e6.go @@ -20,6 +20,7 @@ import ( "errors" "math/big" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" ) @@ -330,6 +331,26 @@ func (z *E6) Exp(x *E6, e big.Int) *E6 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the 
cyclotomic subgroup +func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { + var res, xInv E6 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E6) InverseUnitary(x *E6) *E6 { return z.Conjugate(x) diff --git a/ecc/bw6-633/internal/fptower/e6_test.go b/ecc/bw6-633/internal/fptower/e6_test.go index 49e6cb7bf0..51c5cbce43 100644 --- a/ecc/bw6-633/internal/fptower/e6_test.go +++ b/ecc/bw6-633/internal/fptower/e6_test.go @@ -17,6 +17,7 @@ package fptower import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -172,6 +173,7 @@ func TestE6Ops(t *testing.T) { genA := GenE6() genB := GenE6() + genExp := GenFp() properties.Property("[BW6-633] sub & add should leave an element invariant", prop.ForAll( func(a, b *E6) bool { @@ -277,6 +279,29 @@ func TestE6Ops(t *testing.T) { genA, )) + properties.Property("[BW6-633] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E6, e fp.Element) bool { + var b, c, d E6 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.Frobenius(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(6) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BW6-633] Frobenius of x in E6 should be equal to x^q", prop.ForAll( func(a *E6) bool { var b, c E6 diff --git a/ecc/bw6-761/internal/fptower/e6.go b/ecc/bw6-761/internal/fptower/e6.go index b8027d9bcf..6261475843 100644 --- a/ecc/bw6-761/internal/fptower/e6.go +++ b/ecc/bw6-761/internal/fptower/e6.go @@ -20,6 +20,7 @@ import ( "errors" "math/big" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" ) @@ -329,6 +330,26 @@ func (z *E6) Exp(x *E6, e big.Int) *E6 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the cyclotomic subgroup +func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { + var res, xInv E6 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E6) InverseUnitary(x *E6) *E6 { return z.Conjugate(x) diff --git a/ecc/bw6-761/internal/fptower/e6_test.go b/ecc/bw6-761/internal/fptower/e6_test.go index 44aeb66186..0e693628cc 100644 --- a/ecc/bw6-761/internal/fptower/e6_test.go +++ b/ecc/bw6-761/internal/fptower/e6_test.go @@ -17,6 +17,7 @@ package fptower import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -172,6 +173,7 @@ func TestE6Ops(t *testing.T) { genA := GenE6() genB := GenE6() + genExp := GenFp() properties.Property("[BW6-761] sub & add should leave an element invariant", prop.ForAll( func(a, b *E6) bool { @@ -277,6 +279,29 @@ func TestE6Ops(t *testing.T) { genA, )) + properties.Property("[BW6-761] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E6, e fp.Element) bool { + var b, c, d E6 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + 
b.Mul(&b, a) + a.Frobenius(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(6) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BW6-761] Frobenius of x in E6 should be equal to x^q", prop.ForAll( func(a *E6) bool { var b, c E6 diff --git a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl index 07fd9fad1b..1982772154 100644 --- a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl @@ -2,6 +2,7 @@ import ( "math/big" "encoding/binary" "errors" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp" ) @@ -370,6 +371,26 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the cyclotomic subgroup +func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { + var res, xInv E12 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl index 126132c0d0..abd986863f 100644 --- a/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl @@ -1,4 +1,5 @@ import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp" @@ -174,6 +175,7 @@ func TestE12Ops(t *testing.T) { genA := GenE12() genB := GenE12() + genExp := GenFp() properties.Property("[{{ toUpper .Name }}] sub & add should leave an element invariant", prop.ForAll( func(a, b *E12) bool { @@ -331,6 +333,29 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[{{ toUpper .Name }}] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E12, e fp.Element) bool { + var b, c, d E12 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[{{ toUpper .Name }}] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 From feb3e16e573a3cb07669f753f74f1d0f800dc8c9 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Tue, 4 Jan 2022 18:29:59 +0100 Subject: [PATCH 02/16] perf(E12, E24, E6): exponentiation using 2-bit windowing method --- ecc/bls12-377/internal/fptower/e12.go | 22 ++++++++++++++----- ecc/bls12-381/internal/fptower/e12.go | 22 ++++++++++++++----- ecc/bls24-315/internal/fptower/e24.go | 22 ++++++++++++++----- ecc/bn254/internal/fptower/e12.go | 22 ++++++++++++++----- ecc/bw6-633/internal/fptower/e6.go | 22 ++++++++++++++----- ecc/bw6-761/internal/fptower/e6.go | 22 ++++++++++++++----- .../template/fq12over6over2/fq12.go.tmpl | 22 ++++++++++++++----- 7 files changed, 112 insertions(+), 42 deletions(-) diff --git 
a/ecc/bls12-377/internal/fptower/e12.go b/ecc/bls12-377/internal/fptower/e12.go index 06d129ad98..6939f52eb8 100644 --- a/ecc/bls12-377/internal/fptower/e12.go +++ b/ecc/bls12-377/internal/fptower/e12.go @@ -369,22 +369,32 @@ func (z *E12) Inverse(x *E12) *E12 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E12) Exp(x *E12, e big.Int) *E12 { + var res E12 + var ops [3]E12 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/ecc/bls12-381/internal/fptower/e12.go b/ecc/bls12-381/internal/fptower/e12.go index 8935d7b656..572c97e5bd 100644 --- a/ecc/bls12-381/internal/fptower/e12.go +++ b/ecc/bls12-381/internal/fptower/e12.go @@ -369,22 +369,32 @@ func (z *E12) Inverse(x *E12) *E12 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E12) Exp(x *E12, e big.Int) *E12 { + var res E12 + var ops [3]E12 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/ecc/bls24-315/internal/fptower/e24.go b/ecc/bls24-315/internal/fptower/e24.go index ed8444f448..4863295236 100644 --- a/ecc/bls24-315/internal/fptower/e24.go +++ b/ecc/bls24-315/internal/fptower/e24.go @@ -367,22 +367,32 @@ func (z *E24) Inverse(x *E24) *E24 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E24) Exp(x *E24, e big.Int) *E24 { + var res E24 + var ops [3]E24 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/ecc/bn254/internal/fptower/e12.go b/ecc/bn254/internal/fptower/e12.go index e29f7d0a93..15e808a9b1 100644 --- a/ecc/bn254/internal/fptower/e12.go +++ b/ecc/bn254/internal/fptower/e12.go @@ -369,22 +369,32 @@ func (z *E12) Inverse(x *E12) *E12 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E12) Exp(x *E12, e big.Int) *E12 { + var res E12 + var ops [3]E12 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } 
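The 2-bit window in the new Exp trades two extra precomputed powers (ops holds x, x^2, x^3) for roughly half the multiplications of the bit-at-a-time loop it replaces. A quick way to convince oneself that the byte masking and shifting are right is to run the identical loop over plain modular integers and compare against math/big's own Exp; everything below (modulus, base, exponent) is an arbitrary toy choice, not a curve value:

package main

import (
	"fmt"
	"math/big"
)

// windowedExp mirrors the patch's 2-bit windowed loop,
// instantiated over Z/pZ instead of the extension tower.
func windowedExp(x, e, p *big.Int) *big.Int {
	res := big.NewInt(1)
	var ops [3]big.Int
	ops[0].Set(x)          // x
	ops[1].Mul(x, x)       // x^2
	ops[1].Mod(&ops[1], p)
	ops[2].Mul(&ops[1], x) // x^3
	ops[2].Mod(&ops[2], p)

	for _, w := range e.Bytes() {
		mask := byte(0xc0)
		for j := 0; j < 4; j++ {
			// two squarings shift the accumulator by one 2-bit window
			res.Mul(res, res).Mod(res, p)
			res.Mul(res, res).Mod(res, p)
			c := (w & mask) >> (6 - 2*j)
			if c != 0 {
				res.Mul(res, &ops[c-1]).Mod(res, p)
			}
			mask >>= 2
		}
	}
	return res
}

func main() {
	p := big.NewInt(2305843009213693951) // 2^61 - 1, a Mersenne prime
	x := big.NewInt(123456789)
	e := new(big.Int).SetUint64(0xdeadbeefcafebabe)

	want := new(big.Int).Exp(x, e, p)
	fmt.Println(windowedExp(x, e, p).Cmp(want) == 0) // true
}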
diff --git a/ecc/bw6-633/internal/fptower/e6.go b/ecc/bw6-633/internal/fptower/e6.go index c938b91f3f..11d192b912 100644 --- a/ecc/bw6-633/internal/fptower/e6.go +++ b/ecc/bw6-633/internal/fptower/e6.go @@ -312,22 +312,32 @@ func (z *E6) Inverse(x *E6) *E6 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E6) Exp(x *E6, e big.Int) *E6 { + var res E6 + var ops [3]E6 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/ecc/bw6-761/internal/fptower/e6.go b/ecc/bw6-761/internal/fptower/e6.go index 6261475843..8dda8eaa86 100644 --- a/ecc/bw6-761/internal/fptower/e6.go +++ b/ecc/bw6-761/internal/fptower/e6.go @@ -311,22 +311,32 @@ func (z *E6) Inverse(x *E6) *E6 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E6) Exp(x *E6, e big.Int) *E6 { + var res E6 + var ops [3]E6 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl index 1982772154..616a393bf0 100644 --- a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl @@ -352,22 +352,32 @@ func (z *E12) Inverse(x *E12) *E12 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E12) Exp(x *E12, e big.Int) *E12 { + var res E12 + var ops [3]E12 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } From 1587c99f4f567963fc216c27cfe7b1c6a24e5479 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Mon, 24 Jan 2022 14:59:50 +0100 Subject: [PATCH 03/16] feat(GT): exponentiation in GT using 2-dim windowed GLV --- ecc/bls12-377/internal/fptower/e12.go | 71 +++++++++++++++++++ ecc/bls12-377/internal/fptower/parameters.go | 33 +++++++++ ecc/bls12-377/pairing_test.go | 59 +++++++++++++++ ecc/bls12-381/internal/fptower/e12.go | 71 +++++++++++++++++++ ecc/bls12-381/internal/fptower/parameters.go | 33 +++++++++ ecc/bls12-381/pairing_test.go | 59 +++++++++++++++ ecc/bls24-315/internal/fptower/e24.go | 71 +++++++++++++++++++ ecc/bls24-315/internal/fptower/parameters.go | 33 +++++++++ ecc/bls24-315/pairing_test.go | 61 ++++++++++++++++ ecc/bn254/internal/fptower/e12.go | 71 +++++++++++++++++++ 
ecc/bn254/internal/fptower/parameters.go | 33 +++++++++ ecc/bn254/pairing_test.go | 59 +++++++++++++++ ecc/bw6-633/internal/fptower/e6.go | 71 +++++++++++++++++++ ecc/bw6-633/internal/fptower/parameters.go | 33 +++++++++ ecc/bw6-633/pairing_test.go | 61 ++++++++++++++++ ecc/bw6-761/internal/fptower/e6.go | 71 +++++++++++++++++++ ecc/bw6-761/internal/fptower/parameters.go | 33 +++++++++ ecc/bw6-761/pairing_test.go | 61 ++++++++++++++++ .../pairing/template/tests/pairing.go.tmpl | 69 ++++++++++++++++++ .../template/fq12over6over2/fq12.go.tmpl | 71 +++++++++++++++++++ 20 files changed, 1124 insertions(+) create mode 100644 ecc/bls12-377/internal/fptower/parameters.go create mode 100644 ecc/bls12-381/internal/fptower/parameters.go create mode 100644 ecc/bls24-315/internal/fptower/parameters.go create mode 100644 ecc/bn254/internal/fptower/parameters.go create mode 100644 ecc/bw6-633/internal/fptower/parameters.go create mode 100644 ecc/bw6-761/internal/fptower/parameters.go diff --git a/ecc/bls12-377/internal/fptower/e12.go b/ecc/bls12-377/internal/fptower/e12.go index 6939f52eb8..9c1144dea3 100644 --- a/ecc/bls12-377/internal/fptower/e12.go +++ b/ecc/bls12-377/internal/fptower/e12.go @@ -21,6 +21,7 @@ import ( "errors" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" "math/big" ) @@ -399,7 +400,9 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { } // CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition // x must be in the cyclotomic subgroup +// TODO: use a windowed method func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { var res, xInv E12 xInv.InverseUnitary(x) @@ -418,6 +421,74 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { + + var table [15]E12 + var res E12 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + table[3].Frobenius(a) + + // split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1) / 2; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := (k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, 
&table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/ecc/bls12-377/internal/fptower/parameters.go b/ecc/bls12-377/internal/fptower/parameters.go new file mode 100644 index 0000000000..6f2b1d6c7e --- /dev/null +++ b/ecc/bls12-377/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("9586122913090633729", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go index 96af27ede2..61af6eec91 100644 --- a/ecc/bls12-377/pairing_test.go +++ b/ecc/bls12-377/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -39,6 +40,7 @@ func TestPairing(t *testing.T) { genA := GenE12() genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BLS12-377] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -58,6 +60,27 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BLS12-377] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BLS12-377] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -303,3 +326,39 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bls12-381/internal/fptower/e12.go b/ecc/bls12-381/internal/fptower/e12.go index 572c97e5bd..92345294e6 100644 --- 
a/ecc/bls12-381/internal/fptower/e12.go +++ b/ecc/bls12-381/internal/fptower/e12.go @@ -21,6 +21,7 @@ import ( "errors" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" + "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" "math/big" ) @@ -399,7 +400,9 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { } // CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition // x must be in the cyclotomic subgroup +// TODO: use a windowed method func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { var res, xInv E12 xInv.InverseUnitary(x) @@ -418,6 +421,74 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { + + var table [15]E12 + var res E12 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + table[3].Frobenius(a) + + // split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1) / 2; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := (k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, &table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/ecc/bls12-381/internal/fptower/parameters.go b/ecc/bls12-381/internal/fptower/parameters.go new file mode 100644 index 0000000000..9f97e11751 --- /dev/null +++ b/ecc/bls12-381/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
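For completeness, the 2-NAF digits consumed by CyclotomicExp can be reproduced in a few lines; toyNAF below is an independent illustration, not the ecc.NafDecomposition used by the patch. In non-adjacent form every digit lies in {-1, 0, 1} and no two consecutive digits are non-zero, so on average only a third of the positions cost a multiplication, and each -1 position is served by the precomputed unitary inverse, a mere conjugation. The NAF can be one digit longer than the binary expansion, which the e.BitLen()+3 allocation accommodates with slack:

package main

import (
	"fmt"
	"math/big"
)

// toyNAF returns the signed digits of n (least significant first),
// each in {-1, 0, 1} with no two adjacent non-zero digits.
func toyNAF(n *big.Int) []int8 {
	var digits []int8
	k := new(big.Int).Set(n)
	four := big.NewInt(4)
	zero := big.NewInt(0)
	for k.Cmp(zero) > 0 {
		var d int8
		if k.Bit(0) == 1 {
			m := new(big.Int).Mod(k, four).Int64() // k mod 4 is 1 or 3
			d = int8(2 - m)                        // digit 1 or -1
			k.Sub(k, big.NewInt(int64(d)))         // k is now divisible by 4
		}
		digits = append(digits, d)
		k.Rsh(k, 1)
	}
	return digits
}

func main() {
	n := big.NewInt(0xbeef)
	digits := toyNAF(n)

	// recombine sum of digits[i] * 2^i, most significant digit first
	sum := new(big.Int)
	for i := len(digits) - 1; i >= 0; i-- {
		sum.Lsh(sum, 1)
		sum.Add(sum, big.NewInt(int64(digits[i])))
	}
	fmt.Println(sum.Cmp(n) == 0) // true
}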
+ +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("-15132376222941642752", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bls12-381/pairing_test.go b/ecc/bls12-381/pairing_test.go index cda5bd3f94..c45923175d 100644 --- a/ecc/bls12-381/pairing_test.go +++ b/ecc/bls12-381/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -39,6 +40,7 @@ func TestPairing(t *testing.T) { genA := GenE12() genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BLS12-381] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -58,6 +60,27 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BLS12-381] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BLS12-381] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -303,3 +326,39 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bls24-315/internal/fptower/e24.go b/ecc/bls24-315/internal/fptower/e24.go index 4863295236..ad4abd0661 100644 --- a/ecc/bls24-315/internal/fptower/e24.go +++ b/ecc/bls24-315/internal/fptower/e24.go @@ -19,6 +19,7 @@ package fptower import ( "errors" "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" "math/big" ) @@ -397,7 +398,9 @@ func (z *E24) Exp(x *E24, e big.Int) *E24 { } // CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition // x must be in the cyclotomic subgroup +// TODO: use a windowed method func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { var res, xInv E24 xInv.InverseUnitary(x) @@ -416,6 +419,74 @@ func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { + + var table [15]E24 + var res E24 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + table[3].Frobenius(a) 
+ + // split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1)/2 + 1; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := (k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, &table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E24) InverseUnitary(x *E24) *E24 { return z.Conjugate(x) diff --git a/ecc/bls24-315/internal/fptower/parameters.go b/ecc/bls24-315/internal/fptower/parameters.go new file mode 100644 index 0000000000..71ac9072cd --- /dev/null +++ b/ecc/bls24-315/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
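The windowed double-exponentiation at the heart of ExpGLV is easy to sanity-check in a plain multiplicative group, where the endomorphism is just another power map. The sketch below rebuilds the 15-entry table (table[s-1] = phi(a)^(b3b2) * a^(b1b0)) and the joint 2-bit scan for single-limb scalars, then compares against the naive product a^k1 * phi(a)^k2 mod p; the prime, base, eigenvalue and scalars are arbitrary test values, not curve parameters:

package main

import (
	"fmt"
	"math/big"
)

func main() {
	p := big.NewInt(2305843009213693951)  // 2^61 - 1, arbitrary prime modulus
	a := big.NewInt(987654321)            // arbitrary base
	lambda := big.NewInt(1234567)         // stand-in for the Frobenius eigenvalue
	phi := new(big.Int).Exp(a, lambda, p) // phi(a) = a^lambda in this toy group

	// single-limb "scalars", playing the roles of k1[0] and k2[0]
	k1 := uint64(0x0123456789abcdef)
	k2 := uint64(0xfedcba9876543210)

	// table[b3b2b1b0-1] = phi(a)^(b3b2) * a^(b1b0), exactly as in the patch
	var table [15]big.Int
	table[0].Set(a)
	table[3].Set(phi)
	table[1].Mul(&table[0], &table[0]).Mod(&table[1], p)
	table[2].Mul(&table[1], &table[0]).Mod(&table[2], p)
	table[4].Mul(&table[3], &table[0]).Mod(&table[4], p)
	table[5].Mul(&table[3], &table[1]).Mod(&table[5], p)
	table[6].Mul(&table[3], &table[2]).Mod(&table[6], p)
	table[7].Mul(&table[3], &table[3]).Mod(&table[7], p)
	table[8].Mul(&table[7], &table[0]).Mod(&table[8], p)
	table[9].Mul(&table[7], &table[1]).Mod(&table[9], p)
	table[10].Mul(&table[7], &table[2]).Mod(&table[10], p)
	table[11].Mul(&table[7], &table[3]).Mod(&table[11], p)
	table[12].Mul(&table[11], &table[0]).Mod(&table[12], p)
	table[13].Mul(&table[11], &table[1]).Mod(&table[13], p)
	table[14].Mul(&table[11], &table[2]).Mod(&table[14], p)

	// joint 2-bit windowed scan over the single limb
	res := big.NewInt(1)
	mask := uint64(3) << 62
	for j := 0; j < 32; j++ {
		res.Mul(res, res).Mod(res, p)
		res.Mul(res, res).Mod(res, p)
		b1 := (k1 & mask) >> (62 - 2*j)
		b2 := (k2 & mask) >> (62 - 2*j)
		if b1|b2 != 0 {
			s := b2<<2 | b1
			res.Mul(res, &table[s-1]).Mod(res, p)
		}
		mask >>= 2
	}

	// reference: a^k1 * phi(a)^k2 mod p
	want := new(big.Int).Exp(a, new(big.Int).SetUint64(k1), p)
	want.Mul(want, new(big.Int).Exp(phi, new(big.Int).SetUint64(k2), p)).Mod(want, p)

	fmt.Println(res.Cmp(want) == 0) // true
}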
+ +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("-3218079743", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go index e54fca70b4..7f2b9e04df 100644 --- a/ecc/bls24-315/pairing_test.go +++ b/ecc/bls24-315/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -40,6 +41,7 @@ func TestPairing(t *testing.T) { genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BLS24-315] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -59,6 +61,28 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BLS24-315] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(24) + + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BLS24-315] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -304,3 +328,40 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(24) + + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bn254/internal/fptower/e12.go b/ecc/bn254/internal/fptower/e12.go index 15e808a9b1..a0afb02df2 100644 --- a/ecc/bn254/internal/fptower/e12.go +++ b/ecc/bn254/internal/fptower/e12.go @@ -21,6 +21,7 @@ import ( "errors" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fp" + "github.com/consensys/gnark-crypto/ecc/bn254/fr" "math/big" ) @@ -399,7 +400,9 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { } // CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition // x must be in the cyclotomic subgroup +// TODO: use a windowed method func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { var res, xInv E12 xInv.InverseUnitary(x) @@ -418,6 +421,74 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { + + var table [15]E12 + var res E12 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + table[3].Frobenius(a) + + // 
split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1) / 2; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := (k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, &table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/ecc/bn254/internal/fptower/parameters.go b/ecc/bn254/internal/fptower/parameters.go new file mode 100644 index 0000000000..3859aeac76 --- /dev/null +++ b/ecc/bn254/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
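A note on the xGen constants introduced by these parameters.go files: the name, and the "generator of the curve" comment, undersell what the value is used for. On GT, the r-torsion of the cyclotomic subgroup, the p-power Frobenius acts as exponentiation by t - 1 mod r, because r divides p + 1 - t. For the BLS12 and BLS24 curves t - 1 equals the curve seed x (hence 9586122913090633729 for BLS12-377 and -15132376222941642752 for BLS12-381), while for BN254, where t = 6x^2 + 1, the value set just below is 6x^2 for the seed x = 4965661367192848881. In every case xGen is the GLV eigenvalue lambda satisfying Frobenius(a) = a^lambda on GT, which is exactly what PrecomputeLattice(_r, &xGen, &glvBasis) needs to produce short basis vectors for SplitScalar.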
+ +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bn254/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("147946756881789318990833708069417712966", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bn254/pairing_test.go b/ecc/bn254/pairing_test.go index 8948b1872f..a875603cde 100644 --- a/ecc/bn254/pairing_test.go +++ b/ecc/bn254/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bn254/fp" "github.com/consensys/gnark-crypto/ecc/bn254/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -39,6 +40,7 @@ func TestPairing(t *testing.T) { genA := GenE12() genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BN254] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -58,6 +60,27 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BN254] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BN254] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -303,3 +326,39 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bw6-633/internal/fptower/e6.go b/ecc/bw6-633/internal/fptower/e6.go index 11d192b912..6003ad5a8d 100644 --- a/ecc/bw6-633/internal/fptower/e6.go +++ b/ecc/bw6-633/internal/fptower/e6.go @@ -22,6 +22,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" + "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" ) // E6 is a degree two finite field extension of fp3 @@ -342,7 +343,9 @@ func (z *E6) Exp(x *E6, e big.Int) *E6 { } // CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition // x must be in the cyclotomic subgroup +// TODO: use a windowed method func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { var res, xInv E6 xInv.InverseUnitary(x) @@ -361,6 +364,74 @@ func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { + + var table [15]E6 + var res E6 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + 
table[3].Frobenius(a) + + // split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1) / 2; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := (k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, &table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E6) InverseUnitary(x *E6) *E6 { return z.Conjugate(x) diff --git a/ecc/bw6-633/internal/fptower/parameters.go b/ecc/bw6-633/internal/fptower/parameters.go new file mode 100644 index 0000000000..a929cac1f1 --- /dev/null +++ b/ecc/bw6-633/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
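On the expected gains: SplitScalar returns k1 and k2 of roughly half the bit-length of r, and the joint 2-bit scan shares its cyclotomic squarings between the two half-scalars, so ExpGLV performs about (log2 r)/2 squarings plus at most one table multiplication per 2-bit window, after a one-off precomputation of 14 products. The plain windowed Exp, by contrast, squares once per bit of the full exponent. The two TODOs point at the usual refinements: a signed-digit (2-NAF) table, cheap here because inverting a unitary element is a conjugation, and higher-dimensional decompositions that exploit more powers of the Frobenius.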
+ +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("37014442673353839783463348892746893664389658635873267609916377398480286678854893830143", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go index bdffe61d81..fef2c615a4 100644 --- a/ecc/bw6-633/pairing_test.go +++ b/ecc/bw6-633/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -40,6 +41,7 @@ func TestPairing(t *testing.T) { genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BW6-633] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -59,6 +61,28 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BW6-633] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(6) + + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BW6-633] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -304,3 +328,40 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(6) + + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bw6-761/internal/fptower/e6.go b/ecc/bw6-761/internal/fptower/e6.go index 8dda8eaa86..a2cf6e4c47 100644 --- a/ecc/bw6-761/internal/fptower/e6.go +++ b/ecc/bw6-761/internal/fptower/e6.go @@ -22,6 +22,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" + "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" ) // E6 is a degree two finite field extension of fp3 @@ -341,7 +342,9 @@ func (z *E6) Exp(x *E6, e big.Int) *E6 { } // CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition // x must be in the cyclotomic subgroup +// TODO: use a windowed method func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { var res, xInv E6 xInv.InverseUnitary(x) @@ -360,6 +363,74 @@ func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { + + var table [15]E6 + var res E6 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = 
b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + table[3].Frobenius(a) + + // split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1) / 2; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := (k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, &table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E6) InverseUnitary(x *E6) *E6 { return z.Conjugate(x) diff --git a/ecc/bw6-761/internal/fptower/parameters.go b/ecc/bw6-761/internal/fptower/parameters.go new file mode 100644 index 0000000000..2ec4ef2b19 --- /dev/null +++ b/ecc/bw6-761/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
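One implementation detail worth flagging: k1.SetBigInt(&k[0]).FromMont() loads the half-scalar into an fr.Element, which internally stores values in Montgomery form, and then converts back to the regular representation, so that the limbs k1[i] read by the window masks carry the actual bits of the scalar rather than of its Montgomery image. The loop bound of len(k1)/2 then only visits the low limbs a half-length scalar can occupy; the higher limbs are guaranteed zero by the lattice bounds.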
+ +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("3362637538168598222219435186298528655381674028954528064283340709388076588006567983337308081752755143497537638367247", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go index 2ec8af9ac7..14dc68d169 100644 --- a/ecc/bw6-761/pairing_test.go +++ b/ecc/bw6-761/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -40,6 +41,7 @@ func TestPairing(t *testing.T) { genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BW6-761] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -59,6 +61,28 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BW6-761] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(6) + + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BW6-761] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -304,3 +328,40 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(6) + + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/internal/generator/pairing/template/tests/pairing.go.tmpl b/internal/generator/pairing/template/tests/pairing.go.tmpl index 09a0840a96..fcf4058bc7 100644 --- a/internal/generator/pairing/template/tests/pairing.go.tmpl +++ b/internal/generator/pairing/template/tests/pairing.go.tmpl @@ -4,6 +4,7 @@ import ( "testing" "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" + "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -27,6 +28,7 @@ func TestPairing(t *testing.T) { {{- end}} genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[{{ toUpper .Name}}] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -46,6 +48,32 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[{{ toUpper .Name}}] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + {{if or (eq .Name "bw6-761") (eq .Name "bw6-633")}} + k := 
new(big.Int).SetUint64(6)
+	{{else if eq .Name "bls24-315"}}
+	k := new(big.Int).SetUint64(24)
+	{{ else }}
+	k := new(big.Int).SetUint64(12)
+	{{- end}}
+	e.Exp(e, k)
+	e.ToBigIntRegular(&_e)
+
+	var b, c, d GT
+	b.Exp(&a, _e)
+	c.ExpGLV(&a, &_e)
+	d.CyclotomicExp(&a, _e)
+
+	return b.Equal(&c) && c.Equal(&d)
+	},
+	genA,
+	genP,
+	))
+
 	properties.Property("[{{ toUpper .Name}}] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll(
 		func(a GT) bool {
 			var b, c, d GT
@@ -296,3 +324,44 @@ func BenchmarkMultiPair(b *testing.B) {
 		})
 	}
 }
+
+func BenchmarkExpGT(b *testing.B) {
+
+	var a GT
+	a.SetRandom()
+	a = FinalExponentiation(&a)
+
+	var e fp.Element
+	e.SetRandom()
+	{{if or (eq .Name "bw6-761") (eq .Name "bw6-633")}}
+	k := new(big.Int).SetUint64(6)
+	{{else if eq .Name "bls24-315"}}
+	k := new(big.Int).SetUint64(24)
+	{{ else }}
+	k := new(big.Int).SetUint64(12)
+	{{- end}}
+	e.Exp(e, k)
+	var _e big.Int
+	e.ToBigIntRegular(&_e)
+
+	b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) {
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			a.Exp(&a, _e)
+		}
+	})
+
+	b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) {
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			a.CyclotomicExp(&a, _e)
+		}
+	})
+
+	b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) {
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			a.ExpGLV(&a, &_e)
+		}
+	})
+}
diff --git a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl
index 616a393bf0..25f9b4eaf9 100644
--- a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl
+++ b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp"
+	"github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr"
 )
 
 // E12 is a degree two finite field extension of fp6
@@ -382,7 +383,9 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 {
 }
 
 // CyclotomicExp sets z=x**e and returns it
+// uses 2-NAF decomposition
 // x must be in the cyclotomic subgroup
+// TODO: use a windowed method
 func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 {
 	var res, xInv E12
 	xInv.InverseUnitary(x)
@@ -401,6 +404,74 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 {
 	return z
 }
 
+// ExpGLV sets z=x**e and returns it
+// uses 2-dimensional GLV with 2-bits windowed method
+// x must be in GT
+// TODO: use 2-NAF
+// TODO: use higher dimensional decomposition
+func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 {
+
+	var table [15]E12
+	var res E12
+	var k1, k2 fr.Element
+
+	res.SetOne()
+
+	// table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a
+	table[0].Set(a)
+	table[3].Frobenius(a)
+
+	// split the scalar, modifies +-x, Frob(x) accordingly
+	k := ecc.SplitScalar(s, &glvBasis)
+
+	if k[0].Sign() == -1 {
+		k[0].Neg(&k[0])
+		table[0].InverseUnitary(&table[0])
+	}
+	if k[1].Sign() == -1 {
+		k[1].Neg(&k[1])
+		table[3].InverseUnitary(&table[3])
+	}
+
+	// precompute table (2 bits sliding window)
+	// table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0
+	table[1].CyclotomicSquare(&table[0])
+	table[2].Mul(&table[1], &table[0])
+	table[4].Mul(&table[3], &table[0])
+	table[5].Mul(&table[3], &table[1])
+	table[6].Mul(&table[3], &table[2])
+	table[7].CyclotomicSquare(&table[3])
+	table[8].Mul(&table[7], &table[0])
+	table[9].Mul(&table[7], &table[1])
+	table[10].Mul(&table[7], &table[2])
+	table[11].Mul(&table[7], &table[3])
+	table[12].Mul(&table[11], 
&table[0])
+	table[13].Mul(&table[11], &table[1])
+	table[14].Mul(&table[11], &table[2])
+
+	// bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max
+	k1.SetBigInt(&k[0]).FromMont()
+	k2.SetBigInt(&k[1]).FromMont()
+
+	// loop starts from len(k1)/2 due to the bounds
+	for i := len(k1) / 2; i >= 0; i-- {
+		mask := uint64(3) << 62
+		for j := 0; j < 32; j++ {
+			res.CyclotomicSquare(&res).CyclotomicSquare(&res)
+			b1 := (k1[i] & mask) >> (62 - 2*j)
+			b2 := (k2[i] & mask) >> (62 - 2*j)
+			if b1|b2 != 0 {
+				s := (b2<<2 | b1)
+				res.Mul(&res, &table[s-1])
+			}
+			mask = mask >> 2
+		}
+	}
+
+	p.Set(&res)
+	return p
+}
+
 // InverseUnitary inverse a unitary element
 func (z *E12) InverseUnitary(x *E12) *E12 {
 	return z.Conjugate(x)

From 095417b3a4969dbdcc00c8112b8259bd8db5baa7 Mon Sep 17 00:00:00 2001
From: Youssef El Housni
Date: Mon, 24 Jan 2022 16:29:52 +0100
Subject: [PATCH 04/16] feat(GT, bls12-377): bucket-list MSM

---
 ecc/bls12-377/internal/fptower/multiexp.go | 1229 ++++++++++++++++++++
 ecc/bls12-377/pairing_test.go              |  143 +++
 2 files changed, 1372 insertions(+)
 create mode 100644 ecc/bls12-377/internal/fptower/multiexp.go

diff --git a/ecc/bls12-377/internal/fptower/multiexp.go b/ecc/bls12-377/internal/fptower/multiexp.go
new file mode 100644
index 0000000000..37ef35ea04
--- /dev/null
+++ b/ecc/bls12-377/internal/fptower/multiexp.go
@@ -0,0 +1,1229 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bls12-377/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
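+// (editor's note, not in the original comment) equivalently, each scalar is rewritten in a
+// signed-digit base-2^c representation with digits in [-2^{c-1}, 2^{c-1}-1]; a negative digit d
+// is stored as uint64(-d-1) with the window's most significant bit set as a sign flag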
+
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
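+				// worked example (editor's illustration, not in the original patch): with c = 4,
+				// max = 8, a window value of 13 is recoded as 13 - 16 = -3 with carry = 1 into the
+				// next window, and stored below as uint64(-(-3)-1) | msbWindow = 2 | 0b1000 = 0b1010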
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E12) MultiExp(points []E12, scalars []fr.Element, config ecc.MultiExpConfig) (*E12, error) {
+	// note:
+	// each of the MsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunks digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
+	// (in this GT variant the buckets are combined with E12 multiplications rather than the jacobian
+	// extended formulas used by the curve templates)
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on an MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
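+		// worked example (editor's illustration, not in the original patch): with fr.Limbs*64 = 256
+		// and nbPoints = 1<<20, cost(16) = (256/16)*(2^20+2^16) ~ 17.8M group ops, while
+		// cost(12) = (256/12)*(2^20+2^12) ~ 22.5M, so the loop above settles on C = 16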
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerE12 , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]E12, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerE12(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerE12(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.Mul(p, &_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerE12(p *E12, c int, points []E12, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.MsmC4(points, scalars, splitFirstChunk) + + case 5: + p.MsmC5(points, scalars, splitFirstChunk) + + case 6: + p.MsmC6(points, scalars, splitFirstChunk) + + case 7: + p.MsmC7(points, scalars, splitFirstChunk) + + case 8: + p.MsmC8(points, scalars, splitFirstChunk) + + case 9: + p.MsmC9(points, scalars, splitFirstChunk) + + case 10: + p.MsmC10(points, scalars, splitFirstChunk) + + case 11: + p.MsmC11(points, scalars, splitFirstChunk) + + case 12: + p.MsmC12(points, scalars, splitFirstChunk) + + case 13: + p.MsmC13(points, scalars, splitFirstChunk) + + case 14: + p.MsmC14(points, scalars, splitFirstChunk) + + case 15: + p.MsmC15(points, scalars, splitFirstChunk) + + case 16: + p.MsmC16(points, scalars, splitFirstChunk) + + case 20: + p.MsmC20(points, scalars, splitFirstChunk) + + case 21: + p.MsmC21(points, scalars, splitFirstChunk) + + case 22: + p.MsmC22(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkE12 reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkE12(p *E12, c int, chChunks []chan E12) *E12 { + var _p E12 + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.CyclotomicSquare(&_p) + } + totalj := <-chChunks[j] + _p.Mul(&_p, &totalj) + } + + p.Set(&_p) + return p +} + +func msmProcessChunkE12(chunk uint64, + chRes chan<- E12, + buckets []E12, + c uint64, + points []E12, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].SetOne() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + 
s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		var tmp E12
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].Mul(&buckets[bits-1], &points[i])
+		} else {
+			// sub
+			tmp.Conjugate(&points[i])
+			buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp)
+		}
+	}
+
+	// reduce buckets into total
+	// total =  bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var one, runningSum, total E12
+	runningSum.SetOne()
+	total.SetOne()
+	one.SetOne()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].Equal(&one) {
+			runningSum.Mul(&runningSum, &buckets[k])
+		}
+		total.Mul(&total, &runningSum)
+	}
+
+	chRes <- total
+
+}
+
+func (p *E12) MsmC4(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 {
+	const (
+		c        = 4                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E12
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E12, 1)
+	}
+
+	processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) {
+		var buckets [1 << (c - 1)]E12
+		msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E12, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE12(p, c, chChunks[:])
+}
+
+func (p *E12) MsmC5(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 {
+	const (
+		c        = 5                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan E12
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E12, 1)
+	}
+
+	// c doesn't divide 256, last window is smaller we can allocate less buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	go func(j uint64, points []E12, scalars []fr.Element) {
+		var buckets [1 << (lastC - 1)]E12
+		msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars)
+	}(uint64(nbChunks), points, scalars)
+
+	processChunk := func(j 
int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC6(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 6 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC7(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 7 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + 
msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC8(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC9(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 9 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() 
{ + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC10(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC11(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + 
return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC12(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC13(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC14(points []E12, scalars []fr.Element, 
splitFirstChunk bool) *E12 { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC15(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC16(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = 
(fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC20(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC21(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + 
chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC22(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 22 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go index 61af6eec91..7c7cf3be6c 100644 --- a/ecc/bls12-377/pairing_test.go +++ b/ecc/bls12-377/pairing_test.go @@ -19,8 +19,11 @@ package bls12377 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" "github.com/leanovate/gopter" @@ -232,6 +235,112 @@ func TestPairing(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := 
gopter.NewProperties(parameters)
+
+	genScalar := GenFr()
+
+	// size of the multiExps
+	const nbSamples = 143
+
+	// multi exp points
+	var samplePoints [nbSamples]GT
+	var _g, g GT
+	_g.SetRandom()
+
+	// put into GT
+	_g = FinalExponentiation(&_g)
+
+	g.Set(&_g)
+	for i := 1; i <= nbSamples; i++ {
+		samplePoints[i-1].Set(&g)
+		g.Mul(&g, &_g)
+	}
+
+	// final scalar to use in double and add method (without mixer factor)
+	// n(n+1)(2n+1)/6  (sum of the squares from 1 to n)
+	var scalar big.Int
+	scalar.SetInt64(nbSamples)
+	scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1))
+	scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1))
+	scalar.Div(&scalar, new(big.Int).SetInt64(6))
+
+	// ensure a multiexp that is split has the same result as a non-split one
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, split1, split2 GT
+
+			// mixer ensures that all the words of an fr.Element are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePointsLarge[:], scalars16, true)
+
+			split1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			split2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&split1) && r16.Equal(&split2)
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect to have a different multiExp than the above bucket method
+	// for small numbers of points
+	properties.Property("[GT] Multi exponentiation (<50 points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of an fr.Element are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+				samplePoints[i-1].Set(&g)
+				g.Mul(&g, &_g)
+			}
+
+			var op1MultiExp GT
+			op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{})
+
+			var finalBigScalar fr.Element
+			var finalBigScalarBi big.Int
+			var op1ScalarMul GT
+			finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer)
+			finalBigScalar.ToBigIntRegular(&finalBigScalarBi)
+			op1ScalarMul.ExpGLV(&_g, &finalBigScalarBi)
+
+			return op1ScalarMul.Equal(&op1MultiExp)
+		},
+		genScalar,
+	))
+
+	properties.TestingRun(t, gopter.ConsoleReporter(false))
+}
+
 // ------------------------------------------------------------
 // benches
 
@@ -362,3 +471,37 @@ func BenchmarkExpGT(b *testing.B) {
 		}
 	})
 }
+
+func BenchmarkMultiExpGT(b *testing.B) {
+	// ensure every word of the scalars is filled
+	var mixer fr.Element
+	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
+
+	const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits
+	const nbSamples = 1 << pow
+
+	var samplePoints [nbSamples]GT
+	var sampleScalars [nbSamples]fr.Element
+	var _g GT
+	_g.SetRandom()
+
+	for i := 1; i <= nbSamples; i++ {
+		sampleScalars[i-1].SetUint64(uint64(i)).
+			Mul(&sampleScalars[i-1], &mixer).
+			FromMont()
+		samplePoints[i-1].Set(&_g)
+	}
+
+	var testPoint GT
+
+	for i := 5; i <= pow; i++ {
+		using := 1 << i
+
+		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
+	}
+}

From f993ed9a513120437d339bacce87d1f38490f942 Mon Sep 17 00:00:00 2001
From: Youssef El Housni
Date: Tue, 22 Mar 2022 17:57:28 +0100
Subject: [PATCH 05/16] fix: parameters of ExpGLV for the new curves

---
 ecc/bls12-378/internal/fptower/e12.go        | 114 ++++++++++++++++++-
 ecc/bls12-378/internal/fptower/e12_test.go   |  25 ++++
 ecc/bls12-378/internal/fptower/parameters.go |  33 ++++++
 ecc/bls12-378/pairing_test.go                |  59 ++++++++++
 ecc/bw6-633/internal/fptower/parameters.go   |   2 +-
 ecc/bw6-756/internal/fptower/e6.go           | 114 ++++++++++++++++++-
 ecc/bw6-756/internal/fptower/parameters.go   |  33 ++++++
 ecc/bw6-756/pairing_test.go                  |  59 ++++++++++
 ecc/bw6-761/internal/fptower/e3.go           |   4 +-
 ecc/bw6-761/internal/fptower/parameters.go   |   2 +-
 10 files changed, 429 insertions(+), 16 deletions(-)
 create mode 100644 ecc/bls12-378/internal/fptower/parameters.go
 create mode 100644 ecc/bw6-756/internal/fptower/parameters.go

diff --git a/ecc/bls12-378/internal/fptower/e12.go b/ecc/bls12-378/internal/fptower/e12.go
index 07c716fbe4..6f7da023cd 100644
--- a/ecc/bls12-378/internal/fptower/e12.go
+++ b/ecc/bls12-378/internal/fptower/e12.go
@@ -19,7 +19,9 @@ package fptower
 import (
 	"encoding/binary"
 	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/bls12-378/fp"
+	"github.com/consensys/gnark-crypto/ecc/bls12-378/fr"
 	"math/big"
 )
 
@@ -368,25 +370,125 @@ func (z *E12) Inverse(x *E12) *E12 {
 }
 
 // Exp sets z=x**e and returns it
+// uses 2-bits windowed method
 func (z *E12) Exp(x *E12, e big.Int) *E12 {
 	var res E12
+	var ops [3]E12
+
 	res.SetOne()
+	ops[0].Set(x)
+	ops[1].Square(&ops[0])
+	ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1])
+
 	b := e.Bytes()
 	for i := range b {
 		w := b[i]
-		mask := byte(0x80)
-		for j := 7; j >= 0; j-- {
-			res.Square(&res)
-			if (w&mask)>>j != 0 {
-				res.Mul(&res, x)
+		mask := byte(0xc0)
+		for j := 0; j < 4; j++ {
+			res.Square(&res).Square(&res)
+			c := (w & mask) >> (6 - 2*j)
+			if c != 0 {
+				res.Mul(&res, &ops[c-1])
 			}
-			mask = mask 
>> 1
+			mask = mask >> 2
+		}
+	}
+	z.Set(&res)
+
+	return z
+}
+
+// CyclotomicExp sets z=x**e and returns it
+// uses 2-NAF decomposition
+// x must be in the cyclotomic subgroup
+// TODO: use a windowed method
+func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 {
+	var res, xInv E12
+	xInv.InverseUnitary(x)
+	res.SetOne()
+	eNAF := make([]int8, e.BitLen()+3)
+	n := ecc.NafDecomposition(&e, eNAF[:])
+	for i := n - 1; i >= 0; i-- {
+		res.CyclotomicSquare(&res)
+		if eNAF[i] == 1 {
+			res.Mul(&res, x)
+		} else if eNAF[i] == -1 {
+			res.Mul(&res, &xInv)
+		}
+	}
+	z.Set(&res)
+	return z
+}
+
+// ExpGLV sets z=x**e and returns it
+// uses 2-dimensional GLV with 2-bits windowed method
+// x must be in GT
+// TODO: use 2-NAF
+// TODO: use higher dimensional decomposition
+func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 {
+
+	var table [15]E12
+	var res E12
+	var k1, k2 fr.Element
+
+	res.SetOne()
+
+	// table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a
+	table[0].Set(a)
+	table[3].Frobenius(a)
+
+	// split the scalar, modifies +-x, Frob(x) accordingly
+	k := ecc.SplitScalar(s, &glvBasis)
+
+	if k[0].Sign() == -1 {
+		k[0].Neg(&k[0])
+		table[0].InverseUnitary(&table[0])
+	}
+	if k[1].Sign() == -1 {
+		k[1].Neg(&k[1])
+		table[3].InverseUnitary(&table[3])
+	}
+
+	// precompute table (2 bits sliding window)
+	// table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0
+	table[1].CyclotomicSquare(&table[0])
+	table[2].Mul(&table[1], &table[0])
+	table[4].Mul(&table[3], &table[0])
+	table[5].Mul(&table[3], &table[1])
+	table[6].Mul(&table[3], &table[2])
+	table[7].CyclotomicSquare(&table[3])
+	table[8].Mul(&table[7], &table[0])
+	table[9].Mul(&table[7], &table[1])
+	table[10].Mul(&table[7], &table[2])
+	table[11].Mul(&table[7], &table[3])
+	table[12].Mul(&table[11], &table[0])
+	table[13].Mul(&table[11], &table[1])
+	table[14].Mul(&table[11], &table[2])
+
+	// bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max
+	k1.SetBigInt(&k[0]).FromMont()
+	k2.SetBigInt(&k[1]).FromMont()
+
+	// loop starts from len(k1)/2 due to the bounds
+	for i := len(k1) / 2; i >= 0; i-- {
+		mask := uint64(3) << 62
+		for j := 0; j < 32; j++ {
+			res.CyclotomicSquare(&res).CyclotomicSquare(&res)
+			b1 := (k1[i] & mask) >> (62 - 2*j)
+			b2 := (k2[i] & mask) >> (62 - 2*j)
+			if b1|b2 != 0 {
+				s := (b2<<2 | b1)
+				res.Mul(&res, &table[s-1])
+			}
+			mask = mask >> 2
+		}
+	}
+
+	p.Set(&res)
+	return p
+}
+
 // InverseUnitary inverse a unitary element
 func (z *E12) InverseUnitary(x *E12) *E12 {
 	return z.Conjugate(x)
diff --git a/ecc/bls12-378/internal/fptower/e12_test.go b/ecc/bls12-378/internal/fptower/e12_test.go
index 6f5a6ee749..1830c3fadd 100644
--- a/ecc/bls12-378/internal/fptower/e12_test.go
+++ b/ecc/bls12-378/internal/fptower/e12_test.go
@@ -17,6 +17,7 @@
 package fptower
 
 import (
+	"math/big"
 	"testing"
 
 	"github.com/consensys/gnark-crypto/ecc/bls12-378/fp"
@@ -192,6 +193,7 @@ func TestE12Ops(t *testing.T) {
 
 	genA := GenE12()
 	genB := GenE12()
+	genExp := GenFp()
 
 	properties.Property("[BLS12-378] sub & add should leave an element invariant", prop.ForAll(
 		func(a, b *E12) bool {
@@ -349,6 +351,29 @@ func TestE12Ops(t *testing.T) {
 		genA,
 	))
 
+	properties.Property("[BLS12-378] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll(
+		func(a *E12, e fp.Element) bool {
+			var b, c, d E12
+			// put in the cyclo subgroup
+			b.Conjugate(a)
+			a.Inverse(a)
+			b.Mul(&b, a)
+			a.FrobeniusSquare(&b).Mul(a, &b)
+
+			var _e big.Int
+			k := new(big.Int).SetUint64(12)
+			e.Exp(e, k)
+			e.ToBigIntRegular(&_e)
+
+			
c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BLS12-378] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/ecc/bls12-378/internal/fptower/parameters.go b/ecc/bls12-378/internal/fptower/parameters.go new file mode 100644 index 0000000000..7d5ea1a4c3 --- /dev/null +++ b/ecc/bls12-378/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("11045256207009841153", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go index 5dd1db4415..89c3bafc3e 100644 --- a/ecc/bls12-378/pairing_test.go +++ b/ecc/bls12-378/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -39,6 +40,7 @@ func TestPairing(t *testing.T) { genA := GenE12() genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BLS12-378] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -58,6 +60,27 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BLS12-378] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BLS12-378] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -303,3 +326,39 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bw6-633/internal/fptower/parameters.go b/ecc/bw6-633/internal/fptower/parameters.go index 
a929cac1f1..308498b0c6 100644 --- a/ecc/bw6-633/internal/fptower/parameters.go +++ b/ecc/bw6-633/internal/fptower/parameters.go @@ -21,7 +21,7 @@ import ( "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" ) -// generator of the curve +// t-1 var xGen big.Int var glvBasis ecc.Lattice diff --git a/ecc/bw6-756/internal/fptower/e6.go b/ecc/bw6-756/internal/fptower/e6.go index 7a794fb0cc..76e9f7b1dd 100644 --- a/ecc/bw6-756/internal/fptower/e6.go +++ b/ecc/bw6-756/internal/fptower/e6.go @@ -20,6 +20,7 @@ import ( "errors" "math/big" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" ) @@ -311,25 +312,125 @@ func (z *E6) Inverse(x *E6) *E6 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E6) Exp(x *E6, e big.Int) *E6 { + var res E6 + var ops [3]E6 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 + } + } + z.Set(&res) + + return z +} + +// CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition +// x must be in the cyclotomic subgroup +// TODO: use a windowed method +func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { + var res, xInv E6 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) } } z.Set(&res) return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { + + var table [15]E6 + var res E6 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + table[3].Frobenius(a) + + // split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1) / 2; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := 
(k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, &table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E6) InverseUnitary(x *E6) *E6 { return z.Conjugate(x) @@ -404,6 +505,7 @@ func (z *E6) SetBytes(e []byte) error { } // IsInSubGroup ensures GT/E6 is in correct sugroup +// TODO: optimize func (z *E6) IsInSubGroup() bool { var one, _z E6 one.SetOne() diff --git a/ecc/bw6-756/internal/fptower/parameters.go b/ecc/bw6-756/internal/fptower/parameters.go new file mode 100644 index 0000000000..8a8ce6f783 --- /dev/null +++ b/ecc/bw6-756/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" +) + +// t-1 +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("164391353554439166353793911729193406645071739502673898176639736370075683438438023898983435337730", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go index 7db065814e..21c46affe0 100644 --- a/ecc/bw6-756/pairing_test.go +++ b/ecc/bw6-756/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -40,6 +41,7 @@ func TestPairing(t *testing.T) { genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BW6-756] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -59,6 +61,27 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BW6-756] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BW6-756] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -304,3 +327,39 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + 
b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bw6-761/internal/fptower/e3.go b/ecc/bw6-761/internal/fptower/e3.go index 29990fe6bb..37eb4e2c67 100644 --- a/ecc/bw6-761/internal/fptower/e3.go +++ b/ecc/bw6-761/internal/fptower/e3.go @@ -16,7 +16,7 @@ import ( "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" ) -// E3 is a degree-three finite field extension of fp2 +// E3 is a degree-three finite field extension of fp3 type E3 struct { A0, A1, A2 fp.Element } @@ -27,7 +27,7 @@ func (z *E3) Equal(x *E3) bool { return z.A0.Equal(&x.A0) && z.A1.Equal(&x.A1) && z.A2.Equal(&x.A2) } -// SetString sets a E3 elmt from stringf +// SetString sets a E3 elmt from string func (z *E3) SetString(s1, s2, s3 string) *E3 { z.A0.SetString(s1) z.A1.SetString(s2) diff --git a/ecc/bw6-761/internal/fptower/parameters.go b/ecc/bw6-761/internal/fptower/parameters.go index 2ec4ef2b19..8990cd62ea 100644 --- a/ecc/bw6-761/internal/fptower/parameters.go +++ b/ecc/bw6-761/internal/fptower/parameters.go @@ -21,7 +21,7 @@ import ( "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" ) -// generator of the curve +// t-1 var xGen big.Int var glvBasis ecc.Lattice From 61d93f7804ffc6c9825050bd6fdb61d5f98ec6ea Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Wed, 23 Mar 2022 15:14:59 +0100 Subject: [PATCH 06/16] style: unnecessary use of fmt.Sprintf --- ecc/bls12-377/pairing_test.go | 6 +++--- ecc/bls12-378/pairing_test.go | 6 +++--- ecc/bls12-381/pairing_test.go | 6 +++--- ecc/bls24-315/pairing_test.go | 6 +++--- ecc/bn254/pairing_test.go | 6 +++--- ecc/bw6-633/pairing_test.go | 6 +++--- ecc/bw6-756/pairing_test.go | 6 +++--- ecc/bw6-761/pairing_test.go | 6 +++--- internal/generator/pairing/template/tests/pairing.go.tmpl | 6 +++--- 9 files changed, 27 insertions(+), 27 deletions(-) diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go index 61af6eec91..84df59ef04 100644 --- a/ecc/bls12-377/pairing_test.go +++ b/ecc/bls12-377/pairing_test.go @@ -341,21 +341,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int e.ToBigIntRegular(&_e) - b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.Exp(&a, _e) } }) - b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.CyclotomicExp(&a, _e) } }) - b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.ExpGLV(&a, &_e) diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go index 89c3bafc3e..4567591fd7 100644 --- a/ecc/bls12-378/pairing_test.go +++ b/ecc/bls12-378/pairing_test.go @@ -341,21 +341,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int e.ToBigIntRegular(&_e) - b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.Exp(&a, _e) } }) - b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.CyclotomicExp(&a, _e) } }) - b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.ExpGLV(&a, &_e) 
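For context on the middle benchmark: the 2-NAF recoding used by CyclotomicExp rewrites the exponent with signed digits in {-1, 0, 1} such that no two consecutive digits are non-zero, so on average only one digit in three costs a multiplication. The following is a minimal sketch of that recoding for illustration only; `naf` is a hypothetical helper, assuming only math/big, and stands in for (does not reproduce) gnark-crypto's ecc.NafDecomposition used in the code above:

	// naf returns the non-adjacent form of e > 0, least significant digit first.
	// Digits are in {-1, 0, 1} and no two consecutive digits are non-zero.
	func naf(e *big.Int) []int8 {
		n := new(big.Int).Set(e)
		four := big.NewInt(4)
		m := new(big.Int)
		var digits []int8
		for n.Sign() > 0 {
			var d int8
			if n.Bit(0) == 1 {
				// pick d = ±1 so that n-d ≡ 0 (mod 4): d = 1 if n ≡ 1, d = -1 if n ≡ 3 (mod 4)
				d = int8(2 - m.Mod(n, four).Int64())
				n.Sub(n, big.NewInt(int64(d)))
			}
			digits = append(digits, d)
			n.Rsh(n, 1)
		}
		return digits
	}

A -1 digit costs a multiplication by the precomputed inverse, which in the cyclotomic subgroup is just a conjugation (InverseUnitary); that is why CyclotomicExp can afford signed digits where a generic Exp cannot.
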
diff --git a/ecc/bls12-381/pairing_test.go b/ecc/bls12-381/pairing_test.go index c45923175d..ddd4eb0099 100644 --- a/ecc/bls12-381/pairing_test.go +++ b/ecc/bls12-381/pairing_test.go @@ -341,21 +341,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int e.ToBigIntRegular(&_e) - b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.Exp(&a, _e) } }) - b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.CyclotomicExp(&a, _e) } }) - b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.ExpGLV(&a, &_e) diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go index 7f2b9e04df..df0fa0fa87 100644 --- a/ecc/bls24-315/pairing_test.go +++ b/ecc/bls24-315/pairing_test.go @@ -344,21 +344,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int e.ToBigIntRegular(&_e) - b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.Exp(&a, _e) } }) - b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.CyclotomicExp(&a, _e) } }) - b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.ExpGLV(&a, &_e) diff --git a/ecc/bn254/pairing_test.go b/ecc/bn254/pairing_test.go index a875603cde..01f6db52b9 100644 --- a/ecc/bn254/pairing_test.go +++ b/ecc/bn254/pairing_test.go @@ -341,21 +341,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int e.ToBigIntRegular(&_e) - b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.Exp(&a, _e) } }) - b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.CyclotomicExp(&a, _e) } }) - b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.ExpGLV(&a, &_e) diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go index fef2c615a4..707ecf0272 100644 --- a/ecc/bw6-633/pairing_test.go +++ b/ecc/bw6-633/pairing_test.go @@ -344,21 +344,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int e.ToBigIntRegular(&_e) - b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.Exp(&a, _e) } }) - b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.CyclotomicExp(&a, _e) } }) - b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.ExpGLV(&a, &_e) diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go index 21c46affe0..c586872320 100644 --- a/ecc/bw6-756/pairing_test.go +++ b/ecc/bw6-756/pairing_test.go @@ -342,21 +342,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int 
e.ToBigIntRegular(&_e)
 
-	b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) {
+	b.Run("Naive windowed Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.Exp(&a, _e)
 		}
 	})
 
-	b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) {
+	b.Run("2-NAF cyclotomic Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.CyclotomicExp(&a, _e)
 		}
 	})
 
-	b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) {
+	b.Run("windowed 2-dim GLV Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.ExpGLV(&a, &_e)
diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go
index 14dc68d169..d86c3e9729 100644
--- a/ecc/bw6-761/pairing_test.go
+++ b/ecc/bw6-761/pairing_test.go
@@ -344,21 +344,21 @@ func BenchmarkExpGT(b *testing.B) {
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
 
-	b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) {
+	b.Run("Naive windowed Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.Exp(&a, _e)
 		}
 	})
 
-	b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) {
+	b.Run("2-NAF cyclotomic Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.CyclotomicExp(&a, _e)
 		}
 	})
 
-	b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) {
+	b.Run("windowed 2-dim GLV Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.ExpGLV(&a, &_e)
diff --git a/internal/generator/pairing/template/tests/pairing.go.tmpl b/internal/generator/pairing/template/tests/pairing.go.tmpl
index bf655a4f3f..fe0fe1519e 100644
--- a/internal/generator/pairing/template/tests/pairing.go.tmpl
+++ b/internal/generator/pairing/template/tests/pairing.go.tmpl
@@ -344,21 +344,21 @@ func BenchmarkExpGT(b *testing.B) {
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
 
-	b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) {
+	b.Run("Naive windowed Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.Exp(&a, _e)
 		}
 	})
 
-	b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) {
+	b.Run("2-NAF cyclotomic Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.CyclotomicExp(&a, _e)
 		}
 	})
 
-	b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) {
+	b.Run("windowed 2-dim GLV Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.ExpGLV(&a, &_e)

From 8022ee1e012b2bf123f590cb5910e55ff5e0d862 Mon Sep 17 00:00:00 2001
From: Youssef El Housni
Date: Wed, 23 Mar 2022 16:02:55 +0100
Subject: [PATCH 07/16] test: MSM5 and MSM16 for GT

---
 ecc/bls12-377/pairing_test.go | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go
index 8068bd8a35..ea32df9e11 100644
--- a/ecc/bls12-377/pairing_test.go
+++ b/ecc/bls12-377/pairing_test.go
@@ -298,6 +298,40 @@ func TestMultiExpGT(t *testing.T) {
 		genScalar,
 	))
 
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of square", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected, g GT
+			g.SetRandom()
+			// put into GT
+			g = FinalExponentiation(&_g)
+
+			// compute expected result with double and add
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(&_g, &finalScalar)
+
+			// mixer ensures that all the words of a fpElement are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+ Mul(&sampleScalars[i-1], &mixer). + FromMont() + } + + scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU()) + scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) + + var r5, r16 GT + r5.MsmC5(samplePoints[:], scalars5, false) + r16.MsmC16(samplePoints[:], scalars16, true) + return (r5.Equal(&expected) && r16.Equal(&expected)) + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[GT] Multi exponentation (<50points) should be consistant with sum of square", prop.ForAll( From c657828f9a91c1d35348e3205a9f4b57e90896c0 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Wed, 23 Mar 2022 16:07:31 +0100 Subject: [PATCH 08/16] feat: GT-MSM for BLS12-378 --- ecc/bls12-378/internal/fptower/multiexp.go | 1229 ++++++++++++++++++++ ecc/bls12-378/pairing_test.go | 176 +++ 2 files changed, 1405 insertions(+) create mode 100644 ecc/bls12-378/internal/fptower/multiexp.go diff --git a/ecc/bls12-378/internal/fptower/multiexp.go b/ecc/bls12-378/internal/fptower/multiexp.go new file mode 100644 index 0000000000..2c1feb87c5 --- /dev/null +++ b/ecc/bls12-378/internal/fptower/multiexp.go @@ -0,0 +1,1229 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fptower + +import ( + "errors" + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" + "github.com/consensys/gnark-crypto/internal/parallel" + "math" + "runtime" +) + +/* Multi-Exponentiation à la Pippenger */ + +// selector stores the index, mask and shifts needed to select bits from a scalar +// it is used during the multiExp algorithm or the batch scalar multiplication +type selector struct { + index uint64 // index in the multi-word scalar to select bits from + mask uint64 // mask (c-bit wide) + shift uint64 // shift needed to get our bits on low positions + + multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) + maskHigh uint64 // same than mask, for index+1 + shiftHigh uint64 // same than shift, for index+1 +} + +// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits +// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract +// 2^{c} to the current digit, making it negative. 
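+// (for example, with c = 4 a window holding the value 13 >= 2^{c-1} = 8 is rewritten as the
+// digit 13 - 16 = -3 together with a carry of 1 into the next window, since 13 = -3 + 16)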
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
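+				// (the borrow keeps every digit in the range [-2^{c-1}, 2^{c-1}-1], so it still
+				// fits in c bits with the msb acting as a sign flag)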
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E12) MultiExp(points []E12, scalars []fr.Element, config ecc.MultiExpConfig) (*E12, error) {
+	// note:
+	// each of the MsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows us to declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
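+		// (for instance, with nbPoints = 1<<20 the model above gives, for c = 16, about
+		// 256/16 * (2^20 + 2^16) ≈ 17.8M group operations)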
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerE12 , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]E12, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerE12(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerE12(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.Mul(p, &_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerE12(p *E12, c int, points []E12, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.MsmC4(points, scalars, splitFirstChunk) + + case 5: + p.MsmC5(points, scalars, splitFirstChunk) + + case 6: + p.MsmC6(points, scalars, splitFirstChunk) + + case 7: + p.MsmC7(points, scalars, splitFirstChunk) + + case 8: + p.MsmC8(points, scalars, splitFirstChunk) + + case 9: + p.MsmC9(points, scalars, splitFirstChunk) + + case 10: + p.MsmC10(points, scalars, splitFirstChunk) + + case 11: + p.MsmC11(points, scalars, splitFirstChunk) + + case 12: + p.MsmC12(points, scalars, splitFirstChunk) + + case 13: + p.MsmC13(points, scalars, splitFirstChunk) + + case 14: + p.MsmC14(points, scalars, splitFirstChunk) + + case 15: + p.MsmC15(points, scalars, splitFirstChunk) + + case 16: + p.MsmC16(points, scalars, splitFirstChunk) + + case 20: + p.MsmC20(points, scalars, splitFirstChunk) + + case 21: + p.MsmC21(points, scalars, splitFirstChunk) + + case 22: + p.MsmC22(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkE12 reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkE12(p *E12, c int, chChunks []chan E12) *E12 { + var _p E12 + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.CyclotomicSquare(&_p) + } + totalj := <-chChunks[j] + _p.Mul(&_p, &totalj) + } + + p.Set(&_p) + return p +} + +func msmProcessChunkE12(chunk uint64, + chRes chan<- E12, + buckets []E12, + c uint64, + points []E12, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].SetOne() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + 
s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + var tmp E12 + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].Mul(&buckets[bits-1], &points[i]) + } else { + // sub + tmp.Conjugate(&points[i]) + buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var one, runningSum, total E12 + runningSum.SetOne() + total.SetOne() + one.SetOne() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].Equal(&one) { + runningSum.Mul(&runningSum, &buckets[k]) + } + total.Mul(&total, &runningSum) + } + + chRes <- total + +} + +func (p *E12) MsmC4(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 4 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC5(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 5 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j 
int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC6(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 6 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC7(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 7 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + 
msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC8(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC9(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 9 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() 
{ + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC10(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC11(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + 
return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC12(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC13(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC14(points []E12, scalars []fr.Element, 
splitFirstChunk bool) *E12 { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC15(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC16(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = 
(fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC20(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC21(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + 
chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC22(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 22 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go index 4567591fd7..1517f195cc 100644 --- a/ecc/bls12-378/pairing_test.go +++ b/ecc/bls12-378/pairing_test.go @@ -19,8 +19,11 @@ package bls12378 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" "github.com/leanovate/gopter" @@ -231,6 +234,145 @@ func TestPairing(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := 
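// msmProcessChunkE12 itself is not part of this hunk; for orientation, here is
// a minimal sketch of the standard bucket accumulation it presumably performs,
// written multiplicatively since GT is a multiplicative group. digitAt is a
// hypothetical helper standing in for the digit extraction prepared by
// partitionScalars (the signed-digit recoding that lets 2^(c-1) buckets
// suffice is glossed over here):
func processChunkSketch(chunk uint64, chRes chan<- E12, buckets []E12, c uint64,
	points []E12, scalars []fr.Element) {
	for i := range buckets {
		buckets[i].SetOne() // identity element of GT
	}
	// drop each point into the bucket selected by its c-bit digit
	for i := range scalars {
		if digit := digitAt(&scalars[i], chunk, c); digit != 0 { // digitAt: hypothetical
			buckets[digit-1].Mul(&buckets[digit-1], &points[i])
		}
	}
	// running-product trick: computes prod_k buckets[k]^(k+1) with only
	// ~2*len(buckets) multiplications
	var runningProd, total E12
	runningProd.SetOne()
	total.SetOne()
	for k := len(buckets) - 1; k >= 0; k-- {
		runningProd.Mul(&runningProd, &buckets[k])
		total.Mul(&total, &runningProd)
	}
	chRes <- total
}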
gopter.NewProperties(parameters)
+
+	genScalar := GenFr()
+
+	// size of the multi-exps
+	const nbSamples = 143
+
+	// multi-exp points
+	var samplePoints [nbSamples]GT
+	var _g, g GT
+	_g.SetRandom()
+
+	// put into GT
+	_g = FinalExponentiation(&_g)
+
+	g.Set(&_g)
+	for i := 1; i <= nbSamples; i++ {
+		samplePoints[i-1].Set(&g)
+		g.Mul(&g, &_g)
+	}
+
+	// final scalar to use in double and add method (without mixer factor)
+	// n(n+1)(2n+1)/6 (sum of the squares from 1 to n)
+	var scalar big.Int
+	scalar.SetInt64(nbSamples)
+	scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1))
+	scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1))
+	scalar.Div(&scalar, new(big.Int).SetInt64(6))
+
+	// ensure a multiexp that is split has the same result as a non-split one
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, split1, split2 GT
+
+			// mixer ensures that all the words of an fr.Element are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePoints[:], scalars16, true)
+
+			split1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			split2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&split1) && r16.Equal(&split2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			// compute expected result with double and add
+			var expected GT
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(&_g, &finalScalar)
+
+			// mixer ensures that all the words of an fr.Element are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect a different multiExp algorithm than the bucket method above
+	// for a small number of points
+	properties.Property("[GT] Multi exponentiation (<50 points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of an fr.Element are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer). 
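// Where the n(n+1)(2n+1)/6 scalar above comes from: samplePoints[i-1] = g^i
// and sampleScalars[i-1] = i*mixer, so the whole multi-exp collapses to
// g^(mixer * sum i^2) with sum_{i=1..n} i^2 = n(n+1)(2n+1)/6. A standalone
// check of that identity for n = nbSamples = 143 (needs "fmt" and "math/big"):
func sumOfSquaresCheck() {
	const n = 143
	bySum := new(big.Int)
	for i := int64(1); i <= n; i++ {
		bySum.Add(bySum, new(big.Int).Mul(big.NewInt(i), big.NewInt(i)))
	}
	byFormula := big.NewInt(n)
	byFormula.Mul(byFormula, big.NewInt(n+1))
	byFormula.Mul(byFormula, big.NewInt(2*n+1))
	byFormula.Div(byFormula, big.NewInt(6))
	fmt.Println(bySum.Cmp(byFormula) == 0) // true: both equal 984984
}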
+ FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(&_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} // ------------------------------------------------------------ // benches @@ -362,3 +504,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&_g) + } + + var testPoint GT + + for i := 5; i <= pow; i++ { + using := 1 << i + + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) + } +} From 6ceeb9083ff6c107fd36ddae86d8245042fef4e8 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Fri, 17 Jun 2022 17:36:47 +0100 Subject: [PATCH 09/16] fix: add GT-exp to bls24-317 --- ecc/bls24-317/internal/fptower/e24.go | 114 ++++++++++++++++++- ecc/bls24-317/internal/fptower/e24_test.go | 25 ++++ ecc/bls24-317/internal/fptower/parameters.go | 33 ++++++ 3 files changed, 166 insertions(+), 6 deletions(-) create mode 100644 ecc/bls24-317/internal/fptower/parameters.go diff --git a/ecc/bls24-317/internal/fptower/e24.go b/ecc/bls24-317/internal/fptower/e24.go index 9792420ca6..48384aa007 100644 --- a/ecc/bls24-317/internal/fptower/e24.go +++ b/ecc/bls24-317/internal/fptower/e24.go @@ -18,6 +18,8 @@ package fptower import ( "errors" + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" "math/big" ) @@ -405,25 +407,125 @@ func BatchInvertE24(a []E24) []E24 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E24) Exp(x *E24, e big.Int) *E24 { + var res E24 + var ops [3]E24 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 + } + } + z.Set(&res) + + return z +} + +// CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition +// x must be in the cyclotomic subgroup +// TODO: use a windowed method +func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { + var res, xInv E24 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, 
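// The 2-bit windowed Exp above first fills ops[0..2] = x, x^2, x^3 and then
// consumes the exponent two bits per step: two squarings per window plus one
// multiplication by ops[c-1] whenever the window value c is non-zero. The same
// digit walk over a uint64 exponent, with a toy modulus small enough (below
// 2^32) that the products cannot overflow — a model, not the E24 routine:
func windowed2bit(x, e, m uint64) uint64 {
	x %= m
	ops := [3]uint64{x, x * x % m, x * x % m * x % m} // x, x^2, x^3
	res := uint64(1)
	for shift := 62; shift >= 0; shift -= 2 {
		res = res * res % m
		res = res * res % m // two squarings per 2-bit window
		if c := (e >> uint(shift)) & 3; c != 0 {
			res = res * ops[c-1] % m
		}
	}
	return res // x^e mod m
}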
x)
+		} else if eNAF[i] == -1 {
+			res.Mul(&res, &xInv)
+		}
+	}
+	z.Set(&res)
+	return z
+}
+
+// ExpGLV sets z=x**e and returns it
+// uses 2-dimensional GLV with a 2-bit windowed method
+// x must be in GT
+// TODO: use 2-NAF
+// TODO: use higher dimensional decomposition
+func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 {
+
+	var table [15]E24
+	var res E24
+	var k1, k2 fr.Element
+
+	res.SetOne()
+
+	// table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a
+	table[0].Set(a)
+	table[3].Frobenius(a)
+
+	// split the scalar, modifies +-a, Frob(a) accordingly
+	k := ecc.SplitScalar(s, &glvBasis)
+
+	if k[0].Sign() == -1 {
+		k[0].Neg(&k[0])
+		table[0].InverseUnitary(&table[0])
+	}
+	if k[1].Sign() == -1 {
+		k[1].Neg(&k[1])
+		table[3].InverseUnitary(&table[3])
+	}
+
+	// precompute table (2 bits sliding window)
+	// table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0
+	table[1].CyclotomicSquare(&table[0])
+	table[2].Mul(&table[1], &table[0])
+	table[4].Mul(&table[3], &table[0])
+	table[5].Mul(&table[3], &table[1])
+	table[6].Mul(&table[3], &table[2])
+	table[7].CyclotomicSquare(&table[3])
+	table[8].Mul(&table[7], &table[0])
+	table[9].Mul(&table[7], &table[1])
+	table[10].Mul(&table[7], &table[2])
+	table[11].Mul(&table[7], &table[3])
+	table[12].Mul(&table[11], &table[0])
+	table[13].Mul(&table[11], &table[1])
+	table[14].Mul(&table[11], &table[2])
+
+	// bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max
+	k1.SetBigInt(&k[0]).FromMont()
+	k2.SetBigInt(&k[1]).FromMont()
+
+	// loop starts from len(k1)/2 + 1 due to the bounds
+	for i := len(k1)/2 + 1; i >= 0; i-- {
+		mask := uint64(3) << 62
+		for j := 0; j < 32; j++ {
+			res.CyclotomicSquare(&res).CyclotomicSquare(&res)
+			b1 := (k1[i] & mask) >> (62 - 2*j)
+			b2 := (k2[i] & mask) >> (62 - 2*j)
+			if b1|b2 != 0 {
+				s := (b2<<2 | b1)
+				res.Mul(&res, &table[s-1])
+			}
+			mask = mask >> 2
+		}
+	}
+
+	p.Set(&res)
+	return p
+}
+
 // InverseUnitary inverts a unitary element
 func (z *E24) InverseUnitary(x *E24) *E24 {
 	return z.Conjugate(x)
diff --git a/ecc/bls24-317/internal/fptower/e24_test.go b/ecc/bls24-317/internal/fptower/e24_test.go
index b39bf90642..b5c2dec411 100644
--- a/ecc/bls24-317/internal/fptower/e24_test.go
+++ b/ecc/bls24-317/internal/fptower/e24_test.go
@@ -17,6 +17,7 @@ package fptower
 import (
+	"math/big"
 	"testing"
 	"github.com/consensys/gnark-crypto/ecc/bls24-317/fp"
@@ -192,6 +193,7 @@ func TestE24Ops(t *testing.T) {
 	genA := GenE24()
 	genB := GenE24()
+	genExp := GenFp()
 	properties.Property("[BLS24-317] sub & add should leave an element invariant", prop.ForAll(
 		func(a, b *E24) bool {
@@ -406,6 +408,29 @@ func TestE24Ops(t *testing.T) {
 		genA,
 	))
+	properties.Property("[BLS24-317] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll(
+		func(a *E24, e fp.Element) bool {
+			var b, c, d E24
+			// put in the cyclo subgroup
+			b.Conjugate(a)
+			a.Inverse(a)
+			b.Mul(&b, a)
+			a.FrobeniusQuad(&b).Mul(a, &b)
+
+			var _e big.Int
+			k := new(big.Int).SetUint64(24)
+			e.Exp(e, k)
+			e.ToBigIntRegular(&_e)
+
+			c.Exp(a, _e)
+			d.CyclotomicExp(a, _e)
+
+			return c.Equal(&d)
+		},
+		genA,
+		genExp,
+	))
+
 	properties.Property("[BLS24-317] Frobenius of x in E24 should be equal to x^q", prop.ForAll(
 		func(a *E24) bool {
 			var b, c E24
diff --git a/ecc/bls24-317/internal/fptower/parameters.go b/ecc/bls24-317/internal/fptower/parameters.go
new file mode 100644
index 0000000000..6d637e8624
--- /dev/null
+++ b/ecc/bls24-317/internal/fptower/parameters.go
@@ -0,0 +1,33 @@
+// Copyright 2020 ConsenSys AG
+//
+// Licensed 
under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("3640754176", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} From 2fbe149d279463d116423bdcb93b889cc3588874 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Fri, 17 Jun 2022 18:14:34 +0100 Subject: [PATCH 10/16] fix: handle negative exponent in Fp12 exp --- ecc/bls12-377/internal/fptower/e12.go | 106 +++++++++++++---- ecc/bls12-377/internal/fptower/e2.go | 21 +++- ecc/bls12-377/pairing_test.go | 4 +- ecc/bls12-378/internal/fptower/e12.go | 106 +++++++++++++---- ecc/bls12-378/internal/fptower/e2.go | 21 +++- ecc/bls12-378/pairing_test.go | 4 +- ecc/bls12-381/internal/fptower/e12.go | 106 +++++++++++++---- ecc/bls12-381/internal/fptower/e2.go | 21 +++- ecc/bls12-381/pairing_test.go | 4 +- ecc/bls24-315/pairing_test.go | 4 +- ecc/bls24-317/pairing_test.go | 4 +- ecc/bn254/internal/fptower/e12.go | 106 +++++++++++++---- ecc/bn254/internal/fptower/e2.go | 21 +++- ecc/bn254/pairing_test.go | 4 +- ecc/bw6-633/pairing_test.go | 4 +- ecc/bw6-756/pairing_test.go | 4 +- ecc/bw6-761/pairing_test.go | 4 +- .../pairing/template/tests/pairing.go.tmpl | 4 +- .../template/fq12over6over2/fq12.go.tmpl | 108 +++++++++++++----- .../tower/template/fq12over6over2/fq2.go.tmpl | 26 ++++- 20 files changed, 523 insertions(+), 159 deletions(-) diff --git a/ecc/bls12-377/internal/fptower/e12.go b/ecc/bls12-377/internal/fptower/e12.go index 1d9325084e..2769e65952 100644 --- a/ecc/bls12-377/internal/fptower/e12.go +++ b/ecc/bls12-377/internal/fptower/e12.go @@ -23,8 +23,15 @@ import ( "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" "math/big" + "sync" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E12 is a degree two finite field extension of fp6 type E12 struct { C0, C1 E6 @@ -408,9 +415,25 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, e big.Int) *E12 { +func (z *E12) Exp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E12 var ops [3]E12 @@ -438,11 +461,28 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, e 
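// ecc.NafDecomposition used by CyclotomicExp rewrites the exponent with digits
// in {-1, 0, 1} such that no two adjacent digits are non-zero, which is why a
// single precomputed xInv (a conjugation, nearly free for unitary elements)
// replaces a whole window table. For intuition, a small standalone NAF
// computation for positive exponents (not the library routine):
func naf(e int64) []int8 {
	var digits []int8 // least significant digit first
	for e != 0 {
		var d int8
		if e&1 == 1 {
			d = int8(2 - e&3) // +1 if e = 1 (mod 4), -1 if e = 3 (mod 4)
			e -= int64(d)
		}
		digits = append(digits, d)
		e >>= 1
	}
	return digits // e.g. naf(7) = [-1 0 0 1], i.e. 7 = 8 - 1
}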
big.Int) *E12 { +func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E12 xInv.InverseUnitary(x) res.SetOne() @@ -460,37 +500,53 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { +func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E12 var res E12 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -505,17 +561,17 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -524,8 +580,8 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bls12-377/internal/fptower/e2.go b/ecc/bls12-377/internal/fptower/e2.go index ee79f4c9d9..32e5ac4bf9 100644 --- a/ecc/bls12-377/internal/fptower/e2.go +++ b/ecc/bls12-377/internal/fptower/e2.go @@ -171,9 +171,26 @@ func (z *E2) Legendre() int { } // Exp sets z=x**e and 
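// The 15-entry table built by ExpGLV satisfies the invariant
// table[s-1] = x^(s&3) * Frobenius(x)^(s>>2) for 1 <= s <= 15, which is what
// the inner loop's lookup table[(b2<<2|b1)-1] relies on. A standalone model
// that tracks each entry as its exponent pair (i on x, j on Frobenius(x)) and
// replays the exact construction order used above:
type glvEntry struct{ i, j int }

func glvTableConsistent() bool {
	var t [15]glvEntry
	mul := func(a, b glvEntry) glvEntry { return glvEntry{a.i + b.i, a.j + b.j} }
	t[0] = glvEntry{1, 0}  // x
	t[3] = glvEntry{0, 1}  // Frobenius(x)
	t[1] = mul(t[0], t[0]) // CyclotomicSquare
	t[2] = mul(t[1], t[0])
	t[4] = mul(t[3], t[0])
	t[5] = mul(t[3], t[1])
	t[6] = mul(t[3], t[2])
	t[7] = mul(t[3], t[3]) // CyclotomicSquare
	t[8] = mul(t[7], t[0])
	t[9] = mul(t[7], t[1])
	t[10] = mul(t[7], t[2])
	t[11] = mul(t[7], t[3])
	t[12] = mul(t[11], t[0])
	t[13] = mul(t[11], t[1])
	t[14] = mul(t[11], t[2])
	for s := 1; s <= 15; s++ {
		if t[s-1].i != s&3 || t[s-1].j != s>>2 {
			return false
		}
	}
	return true // holds for all 15 entries
}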
returns it -func (z *E2) Exp(x E2, exponent *big.Int) *E2 { +func (z *E2) Exp(x E2, e *big.Int) *E2 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(k) + } + z.SetOne() - b := exponent.Bytes() + b := e.Bytes() for i := 0; i < len(b); i++ { w := b[i] for j := 0; j < 8; j++ { diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go index be852d0280..e678c58613 100644 --- a/ecc/bls12-377/pairing_test.go +++ b/ecc/bls12-377/pairing_test.go @@ -77,7 +77,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -407,7 +407,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bls12-378/internal/fptower/e12.go b/ecc/bls12-378/internal/fptower/e12.go index 14437de2b3..27f6285fdc 100644 --- a/ecc/bls12-378/internal/fptower/e12.go +++ b/ecc/bls12-378/internal/fptower/e12.go @@ -23,8 +23,15 @@ import ( "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" "math/big" + "sync" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E12 is a degree two finite field extension of fp6 type E12 struct { C0, C1 E6 @@ -408,9 +415,25 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, e big.Int) *E12 { +func (z *E12) Exp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E12 var ops [3]E12 @@ -438,11 +461,28 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { +func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E12 xInv.InverseUnitary(x) res.SetOne() @@ -460,37 +500,53 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { +func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if 
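// The guard added across these files uses the identity x^k = (x^-1)^(-k),
// valid for any invertible x. Two details worth flagging in the versions
// above, as far as I can tell from the diff alone: in the E12 variant,
// bigIntPool.Get returns a *big.Int but `e = *bigIntPool.Get().(*big.Int)`
// copies the pooled value, so `defer bigIntPool.Put(e)` stores a big.Int
// value back in the pool and a later Get().(*big.Int) on that entry would
// fail its type assertion (go vet also flags the non-pointer Put). In the E2
// variant, `k.Neg(k)` negates the freshly pooled (stale) value rather than e,
// and k is never read again; the loop walks e.Bytes(), and big.Int.Bytes is
// defined on the absolute value, so the result still comes out as (x^-1)^|e|.
// A consistency check one could add to the fptower tests (negExpIdentity is a
// hypothetical helper, not part of this patch):
func negExpIdentity(a *E2, e *big.Int) bool {
	var lhs, rhs E2
	lhs.Exp(*a, new(big.Int).Neg(e)) // x^(-e), exercising the negative branch
	rhs.Inverse(a)
	rhs.Exp(rhs, e) // (x^-1)^e
	return lhs.Equal(&rhs)
}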
k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E12 var res E12 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -505,17 +561,17 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -524,8 +580,8 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bls12-378/internal/fptower/e2.go b/ecc/bls12-378/internal/fptower/e2.go index 4ca5593160..0223d0aac8 100644 --- a/ecc/bls12-378/internal/fptower/e2.go +++ b/ecc/bls12-378/internal/fptower/e2.go @@ -171,9 +171,26 @@ func (z *E2) Legendre() int { } // Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, exponent *big.Int) *E2 { +func (z *E2) Exp(x E2, e *big.Int) *E2 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(k) + } + z.SetOne() - b := exponent.Bytes() + b := e.Bytes() for i := 0; i < len(b); i++ { w := b[i] for j := 0; j < 8; j++ { diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go index 4323a4dfac..c0a4e66eae 100644 --- a/ecc/bls12-378/pairing_test.go +++ b/ecc/bls12-378/pairing_test.go @@ -77,7 +77,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -407,7 +407,7 @@ func 
BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bls12-381/internal/fptower/e12.go b/ecc/bls12-381/internal/fptower/e12.go index 2ef60dbd25..7ba4aa36ec 100644 --- a/ecc/bls12-381/internal/fptower/e12.go +++ b/ecc/bls12-381/internal/fptower/e12.go @@ -23,8 +23,15 @@ import ( "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" "math/big" + "sync" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E12 is a degree two finite field extension of fp6 type E12 struct { C0, C1 E6 @@ -408,9 +415,25 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, e big.Int) *E12 { +func (z *E12) Exp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E12 var ops [3]E12 @@ -438,11 +461,28 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { +func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E12 xInv.InverseUnitary(x) res.SetOne() @@ -460,37 +500,53 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { +func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E12 var res E12 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding 
window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -505,17 +561,17 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -524,8 +580,8 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bls12-381/internal/fptower/e2.go b/ecc/bls12-381/internal/fptower/e2.go index 20d6479a4b..a553e7fba5 100644 --- a/ecc/bls12-381/internal/fptower/e2.go +++ b/ecc/bls12-381/internal/fptower/e2.go @@ -171,9 +171,26 @@ func (z *E2) Legendre() int { } // Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, exponent *big.Int) *E2 { +func (z *E2) Exp(x E2, e *big.Int) *E2 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(k) + } + z.SetOne() - b := exponent.Bytes() + b := e.Bytes() for i := 0; i < len(b); i++ { w := b[i] for j := 0; j < 8; j++ { diff --git a/ecc/bls12-381/pairing_test.go b/ecc/bls12-381/pairing_test.go index 003f841d31..4e7d4e4d7c 100644 --- a/ecc/bls12-381/pairing_test.go +++ b/ecc/bls12-381/pairing_test.go @@ -77,7 +77,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -407,7 +407,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go index 5100bca1b9..bc1d1383dc 100644 --- a/ecc/bls24-315/pairing_test.go +++ b/ecc/bls24-315/pairing_test.go @@ -79,7 +79,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -410,7 +410,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bls24-317/pairing_test.go b/ecc/bls24-317/pairing_test.go index b0025e26af..b4a4595f81 100644 --- a/ecc/bls24-317/pairing_test.go +++ b/ecc/bls24-317/pairing_test.go @@ -78,7 +78,7 @@ func TestPairing(t 
*testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -408,7 +408,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bn254/internal/fptower/e12.go b/ecc/bn254/internal/fptower/e12.go index 59ca53395f..43f09f1f80 100644 --- a/ecc/bn254/internal/fptower/e12.go +++ b/ecc/bn254/internal/fptower/e12.go @@ -23,8 +23,15 @@ import ( "github.com/consensys/gnark-crypto/ecc/bn254/fp" "github.com/consensys/gnark-crypto/ecc/bn254/fr" "math/big" + "sync" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E12 is a degree two finite field extension of fp6 type E12 struct { C0, C1 E6 @@ -408,9 +415,25 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, e big.Int) *E12 { +func (z *E12) Exp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E12 var ops [3]E12 @@ -438,11 +461,28 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { +func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E12 xInv.InverseUnitary(x) res.SetOne() @@ -460,37 +500,53 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { +func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E12 var res E12 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == 
-1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -505,17 +561,17 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -524,8 +580,8 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bn254/internal/fptower/e2.go b/ecc/bn254/internal/fptower/e2.go index fe7e11343c..f6bec9b381 100644 --- a/ecc/bn254/internal/fptower/e2.go +++ b/ecc/bn254/internal/fptower/e2.go @@ -171,9 +171,26 @@ func (z *E2) Legendre() int { } // Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, exponent *big.Int) *E2 { +func (z *E2) Exp(x E2, e *big.Int) *E2 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(k) + } + z.SetOne() - b := exponent.Bytes() + b := e.Bytes() for i := 0; i < len(b); i++ { w := b[i] for j := 0; j < 8; j++ { diff --git a/ecc/bn254/pairing_test.go b/ecc/bn254/pairing_test.go index 4fb2a98dc2..e342770160 100644 --- a/ecc/bn254/pairing_test.go +++ b/ecc/bn254/pairing_test.go @@ -77,7 +77,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -407,7 +407,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go index be5c03b17f..a8ed3ab2dc 100644 --- a/ecc/bw6-633/pairing_test.go +++ b/ecc/bw6-633/pairing_test.go @@ -79,7 +79,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -410,7 +410,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go index 7b7aaef22b..466a7c797d 100644 --- 
a/ecc/bw6-756/pairing_test.go +++ b/ecc/bw6-756/pairing_test.go @@ -78,7 +78,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -408,7 +408,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go index 5286484779..323deef40f 100644 --- a/ecc/bw6-761/pairing_test.go +++ b/ecc/bw6-761/pairing_test.go @@ -79,7 +79,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -410,7 +410,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/internal/generator/pairing/template/tests/pairing.go.tmpl b/internal/generator/pairing/template/tests/pairing.go.tmpl index c6bcbe6163..d77d7e45c8 100644 --- a/internal/generator/pairing/template/tests/pairing.go.tmpl +++ b/internal/generator/pairing/template/tests/pairing.go.tmpl @@ -70,7 +70,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -415,7 +415,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl index a0908372e3..5975f193b3 100644 --- a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl @@ -2,11 +2,18 @@ import ( "math/big" "encoding/binary" "errors" + "sync" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/{{.Curve.Name}}/fp" "github.com/consensys/gnark-crypto/ecc/{{.Curve.Name}}/fr" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E12 is a degree two finite field extension of fp6 type E12 struct { C0, C1 E6 @@ -391,9 +398,25 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, e big.Int) *E12 { +func (z *E12) Exp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E12 var ops [3]E12 @@ -421,11 +444,28 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { +func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + 
x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E12 xInv.InverseUnitary(x) res.SetOne() @@ -443,37 +483,53 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { +func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } - var table [15]E12 + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + + var table [15]E12 var res E12 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -488,17 +544,17 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -507,8 +563,8 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl index 9640e1ab77..35ed62c3e7 100644 --- a/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl @@ -4,7 +4,6 @@ import ( "github.com/consensys/gnark-crypto/ecc/{{.Curve.Name}}/fp" ) - // E2 is a degree two finite field extension of fp.Element type E2 struct { A0, A1 fp.Element @@ -143,7 +142,7 @@ 
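// The .tmpl edits here are what fan the fq2/fq12 changes out to every curve:
// the code generator executes these templates once per curve, substituting
// {{.Curve.Name}}. A toy text/template run showing just that mechanism (the
// real generator in internal/generator drives many more fields; needs "os"
// and "text/template"):
func renderImportLine() {
	const line = `import "github.com/consensys/gnark-crypto/ecc/{{.Curve.Name}}/fp"` + "\n"
	t := template.Must(template.New("fq2").Parse(line))
	for _, name := range []string{"bls12-377", "bls12-381", "bn254"} {
		var data struct{ Curve struct{ Name string } }
		data.Curve.Name = name
		_ = t.Execute(os.Stdout, data)
	}
}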
func (z *E2) Conjugate(x *E2) *E2 {
 	return z
 }
-// Halve sets z = z / 2 
+// Halve sets z = z / 2
 func (z *E2) Halve() {
 	z.A0.Halve()
 	z.A1.Halve()
@@ -157,9 +156,26 @@ func (z *E2) Legendre() int {
 }
 // Exp sets z=x**e and returns it
-func (z *E2) Exp(x E2, exponent *big.Int) *E2 {
+func (z *E2) Exp(x E2, e *big.Int) *E2 {
+	if e.IsUint64() && e.Uint64() == 0 {
+		return z.SetOne()
+	}
+
+	k := e
+	if k.Sign() == -1 {
+		// negative k, we invert
+		// if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²)
+		x.Inverse(&x)
+
+		// we negate k in a temp big.Int since
+		// Int.Bit(_) of k and -k is different
+		k = bigIntPool.Get().(*big.Int)
+		defer bigIntPool.Put(k)
+		k.Neg(k)
+	}
+
 	z.SetOne()
-	b := exponent.Bytes()
+	b := e.Bytes()
 	for i := 0; i < len(b); i++ {
 		w := b[i]
 		for j := 0; j < 8; j++ {

From <commit sha> Mon Sep 17 00:00:00 2001
From: Youssef El Housni
Date: Sat, 18 Jun 2022 16:28:55 +0100
Subject: [PATCH 11/16] fix: handle negative exponent in Fp24 and Fp6 exp
---
 ecc/bls24-315/internal/fptower/e2.go | 21 ++++-
 ecc/bls24-315/internal/fptower/e24.go | 106 ++++++++++++++++++++------
 ecc/bls24-315/internal/fptower/e4.go | 17 +++++
 ecc/bls24-317/internal/fptower/e2.go | 21 ++++-
 ecc/bls24-317/internal/fptower/e24.go | 106 ++++++++++++++++++++------
 ecc/bls24-317/internal/fptower/e4.go | 17 +++++
 ecc/bw6-633/internal/fptower/e6.go | 106 ++++++++++++++++++++------
 ecc/bw6-756/internal/fptower/e6.go | 106 ++++++++++++++++++++------
 ecc/bw6-761/internal/fptower/e6.go | 106 ++++++++++++++++++++------
 9 files changed, 477 insertions(+), 129 deletions(-)
diff --git a/ecc/bls24-315/internal/fptower/e2.go b/ecc/bls24-315/internal/fptower/e2.go
index e026fee0e0..774c4c1d48 100644
--- a/ecc/bls24-315/internal/fptower/e2.go
+++ b/ecc/bls24-315/internal/fptower/e2.go
@@ -164,9 +164,26 @@ func (z *E2) Legendre() int {
 }
 // Exp sets z=x**e and returns it
-func (z *E2) Exp(x E2, exponent *big.Int) *E2 {
+func (z *E2) Exp(x E2, e *big.Int) *E2 {
+	if e.IsUint64() && e.Uint64() == 0 {
+		return z.SetOne()
+	}
+
+	k := e
+	if k.Sign() == -1 {
+		// negative k, we invert
+		// if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²)
+		x.Inverse(&x)
+
+		// we negate k in a temp big.Int since
+		// Int.Bit(_) of k and -k is different
+		k = bigIntPool.Get().(*big.Int)
+		defer bigIntPool.Put(k)
+		k.Neg(k)
+	}
+
 	z.SetOne()
-	b := exponent.Bytes()
+	b := e.Bytes()
 	for i := 0; i < len(b); i++ {
 		w := b[i]
 		for j := 0; j < 8; j++ {
diff --git a/ecc/bls24-315/internal/fptower/e24.go b/ecc/bls24-315/internal/fptower/e24.go
index bedfba214c..a935af6e11 100644
--- a/ecc/bls24-315/internal/fptower/e24.go
+++ b/ecc/bls24-315/internal/fptower/e24.go
@@ -21,8 +21,15 @@ import (
 	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/bls24-315/fr"
 	"math/big"
+	"sync"
 )
+var bigIntPool = sync.Pool{
+	New: func() interface{} {
+		return new(big.Int)
+	},
+}
+
 // E24 is a degree two finite field extension of fp6
 type E24 struct {
 	D0, D1 E12
@@ -406,9 +413,25 @@ func BatchInvertE24(a []E24) []E24 {
 	return res
 }
-// Exp sets z=x**e and returns it
+// Exp sets z=x**k and returns it
 // uses a 2-bit windowed method
-func (z *E24) Exp(x *E24, e big.Int) *E24 {
+func (z *E24) Exp(x *E24, k big.Int) *E24 {
+	if k.IsUint64() && k.Uint64() == 0 {
+		return z.SetOne()
+	}
+
+	e := k
+	if k.Sign() == -1 {
+		// negative k, we invert
+		// if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q)
+		x.Inverse(x)
+
+		// we negate k in a temp big.Int since
+		// Int.Bit(_) of k and -k is different
+		e = *bigIntPool.Get().(*big.Int)
+		defer bigIntPool.Put(e)
+		e.Neg(&k)
+	}
 	var res E24
 	var ops [3]E24
@@ -436,11 +459,28 @@ func (z *E24) Exp(x *E24, e big.Int) *E24 {
 	return z
 }
-// CyclotomicExp 
sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { +func (z *E24) CyclotomicExp(x *E24, k big.Int) *E24 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E24 xInv.InverseUnitary(x) res.SetOne() @@ -458,37 +498,53 @@ func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { +func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E24 var res E24 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -503,17 +559,17 @@ func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1)/2 + 1; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1)/2 + 1; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -522,8 +578,8 @@ func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bls24-315/internal/fptower/e4.go 
b/ecc/bls24-315/internal/fptower/e4.go index 830f988691..d73f787dd1 100644 --- a/ecc/bls24-315/internal/fptower/e4.go +++ b/ecc/bls24-315/internal/fptower/e4.go @@ -217,6 +217,23 @@ func (z *E4) Inverse(x *E4) *E4 { // Exp sets z=x**e and returns it func (z *E4) Exp(x *E4, e big.Int) *E4 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(&k) + } + var res E4 res.SetOne() b := e.Bytes() diff --git a/ecc/bls24-317/internal/fptower/e2.go b/ecc/bls24-317/internal/fptower/e2.go index 25d035ea80..f5d018924f 100644 --- a/ecc/bls24-317/internal/fptower/e2.go +++ b/ecc/bls24-317/internal/fptower/e2.go @@ -163,9 +163,26 @@ func (z *E2) Legendre() int { } // Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, exponent *big.Int) *E2 { +func (z *E2) Exp(x E2, e *big.Int) *E2 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(k) + } + z.SetOne() - b := exponent.Bytes() + b := e.Bytes() for i := 0; i < len(b); i++ { w := b[i] for j := 0; j < 8; j++ { diff --git a/ecc/bls24-317/internal/fptower/e24.go b/ecc/bls24-317/internal/fptower/e24.go index 48384aa007..f25d66539d 100644 --- a/ecc/bls24-317/internal/fptower/e24.go +++ b/ecc/bls24-317/internal/fptower/e24.go @@ -21,8 +21,15 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" "math/big" + "sync" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E24 is a degree two finite field extension of fp6 type E24 struct { D0, D1 E12 @@ -406,9 +413,25 @@ func BatchInvertE24(a []E24) []E24 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E24) Exp(x *E24, e big.Int) *E24 { +func (z *E24) Exp(x *E24, k big.Int) *E24 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E24 var ops [3]E24 @@ -436,11 +459,28 @@ func (z *E24) Exp(x *E24, e big.Int) *E24 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { +func (z *E24) CyclotomicExp(x *E24, k big.Int) *E24 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E24 xInv.InverseUnitary(x) res.SetOne() @@ -458,37 +498,53 @@ func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { return z 
} -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { +func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E24 var res E24 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -503,17 +559,17 @@ func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1)/2 + 1; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1)/2 + 1; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -522,8 +578,8 @@ func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bls24-317/internal/fptower/e4.go b/ecc/bls24-317/internal/fptower/e4.go index 2c84e6d1c3..bed1311c0d 100644 --- a/ecc/bls24-317/internal/fptower/e4.go +++ b/ecc/bls24-317/internal/fptower/e4.go @@ -218,6 +218,23 @@ func (z *E4) Inverse(x *E4) *E4 { // Exp sets z=x**e and returns it func (z *E4) Exp(x *E4, e big.Int) *E4 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(&k) + } + var res E4 res.SetOne() b := e.Bytes() diff --git a/ecc/bw6-633/internal/fptower/e6.go b/ecc/bw6-633/internal/fptower/e6.go index 71a59cf64d..fdc1afa39c 
100644 --- a/ecc/bw6-633/internal/fptower/e6.go +++ b/ecc/bw6-633/internal/fptower/e6.go @@ -19,12 +19,19 @@ package fptower import ( "errors" "math/big" + "sync" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E6 is a degree two finite field extension of fp3 type E6 struct { B0, B1 E3 @@ -351,9 +358,25 @@ func BatchInvertE6(a []E6) []E6 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E6) Exp(x *E6, e big.Int) *E6 { +func (z *E6) Exp(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E6 var ops [3]E6 @@ -381,11 +404,28 @@ func (z *E6) Exp(x *E6, e big.Int) *E6 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { +func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E6 xInv.InverseUnitary(x) res.SetOne() @@ -403,37 +443,53 @@ func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { +func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E6 var res E6 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -448,17 +504,17 
@@ func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -467,8 +523,8 @@ func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bw6-756/internal/fptower/e6.go b/ecc/bw6-756/internal/fptower/e6.go index e83a3deabe..16464db219 100644 --- a/ecc/bw6-756/internal/fptower/e6.go +++ b/ecc/bw6-756/internal/fptower/e6.go @@ -19,12 +19,19 @@ package fptower import ( "errors" "math/big" + "sync" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E6 is a degree two finite field extension of fp3 type E6 struct { B0, B1 E3 @@ -350,9 +357,25 @@ func BatchInvertE6(a []E6) []E6 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E6) Exp(x *E6, e big.Int) *E6 { +func (z *E6) Exp(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E6 var ops [3]E6 @@ -380,11 +403,28 @@ func (z *E6) Exp(x *E6, e big.Int) *E6 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { +func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E6 xInv.InverseUnitary(x) res.SetOne() @@ -402,37 +442,53 @@ func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { +func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // 
negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E6 var res E6 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -447,17 +503,17 @@ func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -466,8 +522,8 @@ func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bw6-761/internal/fptower/e6.go b/ecc/bw6-761/internal/fptower/e6.go index 87cee347f2..0f914d18d0 100644 --- a/ecc/bw6-761/internal/fptower/e6.go +++ b/ecc/bw6-761/internal/fptower/e6.go @@ -19,12 +19,19 @@ package fptower import ( "errors" "math/big" + "sync" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E6 is a degree two finite field extension of fp3 type E6 struct { B0, B1 E3 @@ -350,9 +357,25 @@ func BatchInvertE6(a []E6) []E6 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E6) Exp(x *E6, e big.Int) *E6 { +func (z *E6) Exp(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E6 var ops [3]E6 @@ -380,11 +403,28 @@ func (z *E6) Exp(x *E6, e big.Int) 
*E6 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { +func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E6 xInv.InverseUnitary(x) res.SetOne() @@ -402,37 +442,53 @@ func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { +func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E6 var res E6 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -447,17 +503,17 @@ func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -466,8 +522,8 @@ func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element From e382eb6c95f156abd0a5ecfca2856585e99d0c2b Mon Sep 17 00:00:00 2001 
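The lint-fix patch below also reworks every Exp variant to take the base by value and the exponent by pointer (func (z *E12) Exp(x E12, k *big.Int)), so the negative-exponent branch mutates a local copy of x instead of the caller's value; in the cyclotomic methods that inversion further becomes a cheap Conjugate. The other recurring primitive is the 2-NAF loop, which pays off precisely because the inverse is nearly free. Below is a minimal sketch of that idea, assuming e > 0 and x invertible mod an odd prime m; nafDecompose and cycloExpNAF are illustrative names, not gnark-crypto's ecc.NafDecomposition, and plain modular arithmetic again stands in for the cyclotomic subgroup.

package main

import (
	"fmt"
	"math/big"
)

// nafDecompose returns the signed digits of e, least significant first,
// each in {-1, 0, +1}, with no two adjacent non-zero digits.
func nafDecompose(e *big.Int) []int8 {
	k := new(big.Int).Set(e)
	one := big.NewInt(1)
	four := big.NewInt(4)
	tmp := new(big.Int)
	var naf []int8
	for k.Sign() > 0 {
		if k.Bit(0) == 1 {
			// d = 2 - (k mod 4) is +1 or -1; subtracting d leaves k
			// divisible by 4, forcing the next digit to be zero
			tmp.Mod(k, four)
			d := int8(2 - tmp.Int64())
			naf = append(naf, d)
			if d == 1 {
				k.Sub(k, one)
			} else {
				k.Add(k, one)
			}
		} else {
			naf = append(naf, 0)
		}
		k.Rsh(k, 1)
	}
	return naf
}

// cycloExpNAF computes x^e mod m by square-and-multiply over the NAF digits,
// multiplying by a precomputed inverse on -1 digits — the role InverseUnitary
// (a conjugation) plays for elements of the cyclotomic subgroup.
func cycloExpNAF(x, e, m *big.Int) *big.Int {
	xInv := new(big.Int).ModInverse(x, m)
	naf := nafDecompose(e)
	res := big.NewInt(1)
	for i := len(naf) - 1; i >= 0; i-- {
		res.Mul(res, res).Mod(res, m) // CyclotomicSquare in the patch
		switch naf[i] {
		case 1:
			res.Mul(res, x).Mod(res, m)
		case -1:
			res.Mul(res, xInv).Mod(res, m)
		}
	}
	return res
}

func main() {
	m := big.NewInt(1000003)
	x := big.NewInt(7)
	e := big.NewInt(12345)
	fmt.Println(cycloExpNAF(x, e, m).Cmp(new(big.Int).Exp(x, e, m)) == 0) // true
}

The NAF form has roughly one non-zero digit in three, versus one in two for plain binary, so when multiplying by x and by x⁻¹ cost the same — as they do once inversion is a conjugation — the loop cuts the expected multiplications from about one per two bits to about one per three.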
From: Youssef El Housni Date: Mon, 20 Jun 2022 11:02:19 +0100 Subject: [PATCH 12/16] fix: golangci-lint --- ecc/bls12-377/internal/fptower/e12.go | 54 +++---- ecc/bls12-377/internal/fptower/e12_test.go | 8 +- ecc/bls12-377/internal/fptower/e2.go | 16 +- ecc/bls12-377/pairing_test.go | 20 +-- ecc/bls12-378/internal/fptower/e12.go | 54 +++---- ecc/bls12-378/internal/fptower/e12_test.go | 8 +- ecc/bls12-378/internal/fptower/e2.go | 16 +- ecc/bls12-378/pairing_test.go | 20 +-- ecc/bls12-381/internal/fptower/e12.go | 54 +++---- ecc/bls12-381/internal/fptower/e12_test.go | 8 +- ecc/bls12-381/internal/fptower/e2.go | 16 +- ecc/bls12-381/pairing_test.go | 20 +-- ecc/bls24-315/internal/fptower/e12.go | 42 ++++- ecc/bls24-315/internal/fptower/e12_test.go | 2 +- ecc/bls24-315/internal/fptower/e2.go | 16 +- ecc/bls24-315/internal/fptower/e24.go | 126 +++++++-------- ecc/bls24-315/internal/fptower/e24_test.go | 10 +- ecc/bls24-315/internal/fptower/e4.go | 39 +++-- ecc/bls24-315/internal/fptower/e4_test.go | 2 +- ecc/bls24-315/pairing_test.go | 20 +-- ecc/bls24-317/internal/fptower/e12.go | 42 ++++- ecc/bls24-317/internal/fptower/e12_test.go | 2 +- ecc/bls24-317/internal/fptower/e2.go | 16 +- ecc/bls24-317/internal/fptower/e24.go | 126 +++++++-------- ecc/bls24-317/internal/fptower/e24_test.go | 10 +- ecc/bls24-317/internal/fptower/e4.go | 41 +++-- ecc/bls24-317/internal/fptower/e4_test.go | 2 +- ecc/bls24-317/pairing_test.go | 20 +-- ecc/bn254/internal/fptower/e12.go | 54 +++---- ecc/bn254/internal/fptower/e12_pairing.go | 2 +- ecc/bn254/internal/fptower/e12_test.go | 8 +- ecc/bn254/internal/fptower/e2.go | 16 +- ecc/bn254/pairing_test.go | 20 +-- ecc/bw6-633/internal/fptower/e6.go | 152 +++++++++--------- ecc/bw6-633/internal/fptower/e6_test.go | 6 +- ecc/bw6-633/pairing_test.go | 20 +-- ecc/bw6-756/internal/fptower/e6.go | 120 +++++++------- ecc/bw6-756/internal/fptower/e6_test.go | 2 +- ecc/bw6-756/pairing_test.go | 20 +-- ecc/bw6-761/internal/fptower/e6.go | 138 ++++++++-------- ecc/bw6-761/internal/fptower/e6_test.go | 6 +- ecc/bw6-761/pairing_test.go | 20 +-- .../pairing/template/tests/pairing.go.tmpl | 20 +-- .../template/fq12over6over2/fq12.go.tmpl | 56 +++---- .../tower/template/fq12over6over2/fq2.go.tmpl | 22 +-- .../fq12over6over2/tests/fq12.go.tmpl | 8 +- 46 files changed, 773 insertions(+), 727 deletions(-) diff --git a/ecc/bls12-377/internal/fptower/e12.go b/ecc/bls12-377/internal/fptower/e12.go index 2769e65952..723902e2bf 100644 --- a/ecc/bls12-377/internal/fptower/e12.go +++ b/ecc/bls12-377/internal/fptower/e12.go @@ -415,9 +415,9 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q¹²) and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, k big.Int) *E12 { +func (z *E12) Exp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -426,20 +426,20 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E12 var ops [3]E12 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -461,37 +461,37 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets 
z=xᵏ (mod q¹²) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { +func (z *E12) CyclotomicExp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E12 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -500,27 +500,27 @@ func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q¹²) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { +func (z *E12) ExpGLV(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E12 @@ -530,11 +530,11 @@ func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) diff --git a/ecc/bls12-377/internal/fptower/e12_test.go b/ecc/bls12-377/internal/fptower/e12_test.go index 3a7306a034..6fe75b79f2 100644 --- a/ecc/bls12-377/internal/fptower/e12_test.go +++ b/ecc/bls12-377/internal/fptower/e12_test.go @@ -391,8 +391,8 @@ func TestE12Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -405,7 +405,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, @@ -416,7 +416,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls12-377/internal/fptower/e2.go b/ecc/bls12-377/internal/fptower/e2.go index 32e5ac4bf9..fc300952ca 100644 --- a/ecc/bls12-377/internal/fptower/e2.go +++ b/ecc/bls12-377/internal/fptower/e2.go @@ -170,23 +170,23 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, e *big.Int) *E2 { - if 
e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } z.SetOne() diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go index e678c58613..cfa761546c 100644 --- a/ecc/bls12-377/pairing_test.go +++ b/ecc/bls12-377/pairing_test.go @@ -76,9 +76,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -97,7 +97,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -124,9 +124,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -393,21 +393,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bls12-378/internal/fptower/e12.go b/ecc/bls12-378/internal/fptower/e12.go index 27f6285fdc..0169ee5054 100644 --- a/ecc/bls12-378/internal/fptower/e12.go +++ b/ecc/bls12-378/internal/fptower/e12.go @@ -415,9 +415,9 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q¹²) and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, k big.Int) *E12 { +func (z *E12) Exp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -426,20 +426,20 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E12 var ops [3]E12 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -461,37 +461,37 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q¹²) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { +func (z *E12) CyclotomicExp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if 
k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E12 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -500,27 +500,27 @@ func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q¹²) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { +func (z *E12) ExpGLV(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E12 @@ -530,11 +530,11 @@ func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) diff --git a/ecc/bls12-378/internal/fptower/e12_test.go b/ecc/bls12-378/internal/fptower/e12_test.go index 66eef6d80b..2ce5f01057 100644 --- a/ecc/bls12-378/internal/fptower/e12_test.go +++ b/ecc/bls12-378/internal/fptower/e12_test.go @@ -391,8 +391,8 @@ func TestE12Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -405,7 +405,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, @@ -416,7 +416,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls12-378/internal/fptower/e2.go b/ecc/bls12-378/internal/fptower/e2.go index 0223d0aac8..55fd82e0b5 100644 --- a/ecc/bls12-378/internal/fptower/e2.go +++ b/ecc/bls12-378/internal/fptower/e2.go @@ -170,23 +170,23 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, e *big.Int) *E2 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == 
(x⁻¹)ᵏ (mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } z.SetOne() diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go index c0a4e66eae..020ace09a2 100644 --- a/ecc/bls12-378/pairing_test.go +++ b/ecc/bls12-378/pairing_test.go @@ -76,9 +76,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -97,7 +97,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -124,9 +124,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -393,21 +393,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bls12-381/internal/fptower/e12.go b/ecc/bls12-381/internal/fptower/e12.go index 7ba4aa36ec..0eaa9f3df4 100644 --- a/ecc/bls12-381/internal/fptower/e12.go +++ b/ecc/bls12-381/internal/fptower/e12.go @@ -415,9 +415,9 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q¹²) and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, k big.Int) *E12 { +func (z *E12) Exp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -426,20 +426,20 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E12 var ops [3]E12 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -461,37 +461,37 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q¹²) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { +func (z *E12) CyclotomicExp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = 
*bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E12 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -500,27 +500,27 @@ func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q¹²) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { +func (z *E12) ExpGLV(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E12 @@ -530,11 +530,11 @@ func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) diff --git a/ecc/bls12-381/internal/fptower/e12_test.go b/ecc/bls12-381/internal/fptower/e12_test.go index 46498b1ea5..0d5f9cd4ae 100644 --- a/ecc/bls12-381/internal/fptower/e12_test.go +++ b/ecc/bls12-381/internal/fptower/e12_test.go @@ -391,8 +391,8 @@ func TestE12Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -405,7 +405,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, @@ -416,7 +416,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls12-381/internal/fptower/e2.go b/ecc/bls12-381/internal/fptower/e2.go index a553e7fba5..6dcb1aca0e 100644 --- a/ecc/bls12-381/internal/fptower/e2.go +++ b/ecc/bls12-381/internal/fptower/e2.go @@ -170,23 +170,23 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, e *big.Int) *E2 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } z.SetOne() diff --git 
a/ecc/bls12-381/pairing_test.go b/ecc/bls12-381/pairing_test.go index 4e7d4e4d7c..5c45046b1c 100644 --- a/ecc/bls12-381/pairing_test.go +++ b/ecc/bls12-381/pairing_test.go @@ -76,9 +76,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -97,7 +97,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -124,9 +124,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -393,21 +393,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bls24-315/internal/fptower/e12.go b/ecc/bls24-315/internal/fptower/e12.go index 535122131d..faa9d387c8 100644 --- a/ecc/bls24-315/internal/fptower/e12.go +++ b/ecc/bls24-315/internal/fptower/e12.go @@ -240,23 +240,49 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it -func (z *E12) Exp(x *E12, e big.Int) *E12 { +// Exp sets z=xᵏ (mod q¹²) and returns it +// uses 2-bits windowed method +func (z *E12) Exp(x E12, k *big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) + } + var res E12 + var ops [3]E12 + res.SetOne() + ops[0].Set(&x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/ecc/bls24-315/internal/fptower/e12_test.go b/ecc/bls24-315/internal/fptower/e12_test.go index ee431c5bc0..c4cec4957f 100644 --- a/ecc/bls24-315/internal/fptower/e12_test.go +++ b/ecc/bls24-315/internal/fptower/e12_test.go @@ -249,6 +249,6 @@ func BenchmarkE12ExpBySeed(b *testing.B) { _, _ = a.SetRandom() b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, seed).Conjugate(&a) + a.Exp(a, &seed).Conjugate(&a) } } diff --git a/ecc/bls24-315/internal/fptower/e2.go b/ecc/bls24-315/internal/fptower/e2.go index 774c4c1d48..de62535878 100644 --- a/ecc/bls24-315/internal/fptower/e2.go +++ b/ecc/bls24-315/internal/fptower/e2.go @@ -163,23 +163,23 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) 
Exp(x E2, e *big.Int) *E2 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } z.SetOne() diff --git a/ecc/bls24-315/internal/fptower/e24.go b/ecc/bls24-315/internal/fptower/e24.go index a935af6e11..043b85e0da 100644 --- a/ecc/bls24-315/internal/fptower/e24.go +++ b/ecc/bls24-315/internal/fptower/e24.go @@ -153,25 +153,25 @@ func (z *E24) CyclotomicSquareCompressed(x *E24) *E24 { var t [7]E4 - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.D0.C1) - // t1 = g5^2 + // t1 = g5² t[1].Square(&x.D1.C2) // t5 = g1 + g5 t[5].Add(&x.D0.C1, &x.D1.C2) - // t2 = (g1 + g5)^2 + // t2 = (g1 + g5)² t[2].Square(&t[5]) - // t3 = g1^2 + g5^2 + // t3 = g1² + g5² t[3].Add(&t[0], &t[1]) // t5 = 2 * g1 * g5 t[5].Sub(&t[2], &t[3]) // t6 = g3 + g2 t[6].Add(&x.D1.C0, &x.D0.C2) - // t3 = (g3 + g2)^2 + // t3 = (g3 + g2)² t[3].Square(&t[6]) - // t2 = g3^2 + // t2 = g3² t[2].Square(&x.D1.C0) // t6 = 2 * nr * g1 * g5 @@ -182,33 +182,33 @@ func (z *E24) CyclotomicSquareCompressed(x *E24) *E24 { // z3 = 6 * nr * g1 * g5 + 2 * g3 z.D1.C0.Add(&t[5], &t[6]) - // t4 = nr * g5^2 + // t4 = nr * g5² t[4].MulByNonResidue(&t[1]) - // t5 = nr * g5^2 + g1^2 + // t5 = nr * g5² + g1² t[5].Add(&t[0], &t[4]) - // t6 = nr * g5^2 + g1^2 - g2 + // t6 = nr * g5² + g1² - g2 t[6].Sub(&t[5], &x.D0.C2) - // t1 = g2^2 + // t1 = g2² t[1].Square(&x.D0.C2) - // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + // t6 = 2 * nr * g5² + 2 * g1² - 2*g2 t[6].Double(&t[6]) - // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + // z2 = 3 * nr * g5² + 3 * g1² - 2*g2 z.D0.C2.Add(&t[6], &t[5]) - // t4 = nr * g2^2 + // t4 = nr * g2² t[4].MulByNonResidue(&t[1]) - // t5 = g3^2 + nr * g2^2 + // t5 = g3² + nr * g2² t[5].Add(&t[2], &t[4]) - // t6 = g3^2 + nr * g2^2 - g1 + // t6 = g3² + nr * g2² - g1 t[6].Sub(&t[5], &x.D0.C1) - // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + // t6 = 2 * g3² + 2 * nr * g2² - 2 * g1 t[6].Double(&t[6]) - // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + // z1 = 3 * g3² + 3 * nr * g2² - 2 * g1 z.D0.C1.Add(&t[6], &t[5]) - // t0 = g2^2 + g3^2 + // t0 = g2² + g3² t[0].Add(&t[2], &t[1]) // t5 = 2 * g3 * g2 t[5].Sub(&t[3], &t[0]) @@ -229,13 +229,13 @@ func (z *E24) DecompressKarabina(x *E24) *E24 { var one E4 one.SetOne() - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.D0.C1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t[1].Sub(&t[0], &x.D0.C2). Double(&t[1]). Add(&t[1], &t[0]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t[2].Square(&x.D1.C2) t[0].MulByNonResidue(&t[2]). Add(&t[0], &t[1]) @@ -248,14 +248,14 @@ func (z *E24) DecompressKarabina(x *E24) *E24 { // t1 = g2 * g1 t[1].Mul(&x.D0.C2, &x.D0.C1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t[2].Square(&x.D1.C1). Sub(&t[2], &t[1]). Double(&t[2]). Sub(&t[2], &t[1]) // t1 = g3 * g5 t[1].Mul(&x.D1.C0, &x.D1.C2) - // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // c₀ = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t[2].Add(&t[2], &t[1]) z.D0.C0.MulByNonResidue(&t[2]). 
Add(&z.D0.C0, &one) @@ -284,13 +284,13 @@ func BatchDecompressKarabina(x []E24) []E24 { one.SetOne() for i := 0; i < n; i++ { - // t0 = g1^2 + // t0 = g1² t0[i].Square(&x[i].D0.C1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t1[i].Sub(&t0[i], &x[i].D0.C2). Double(&t1[i]). Add(&t1[i], &t0[i]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t2[i].Square(&x[i].D1.C2) t0[i].MulByNonResidue(&t2[i]). Add(&t0[i], &t1[i]) @@ -307,7 +307,7 @@ func BatchDecompressKarabina(x []E24) []E24 { // t1 = g2 * g1 t1[i].Mul(&x[i].D0.C2, &x[i].D0.C1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t2[i].Square(&x[i].D1.C1) t2[i].Sub(&t2[i], &t1[i]) t2[i].Double(&t2[i]) @@ -315,7 +315,7 @@ func BatchDecompressKarabina(x []E24) []E24 { // t1 = g3 * g5 t1[i].Mul(&x[i].D1.C0, &x[i].D1.C2) - // z0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // z0 = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t2[i].Add(&t2[i], &t1[i]) x[i].D0.C0.MulByNonResidue(&t2[i]). Add(&x[i].D0.C0, &one) @@ -328,10 +328,10 @@ func BatchDecompressKarabina(x []E24) []E24 { // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E24) CyclotomicSquare(x *E24) *E24 { - // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E4^6 - // cyclosquare(x)=(3*x4^2*u + 3*x0^2 - 2*x0, - // 3*x2^2*u + 3*x3^2 - 2*x1, - // 3*x5^2*u + 3*x1^2 - 2*x2, + // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E4⁶ + // cyclosquare(x)=(3*x4²*u + 3*x0² - 2*x0, + // 3*x2²*u + 3*x3² - 2*x1, + // 3*x5²*u + 3*x1² - 2*x2, // 6*x1*x5*u + 2*x3, // 6*x0*x4 + 2*x4, // 6*x2*x3 + 2*x5) @@ -348,9 +348,9 @@ func (z *E24) CyclotomicSquare(x *E24) *E24 { t[5].Square(&x.D0.C1) t[8].Add(&x.D1.C2, &x.D0.C1).Square(&t[8]).Sub(&t[8], &t[4]).Sub(&t[8], &t[5]).MulByNonResidue(&t[8]) // 2*x5*x1*u - t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4^2*u + x0^2 - t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2^2*u + x3^2 - t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5^2*u + x1^2 + t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4²*u + x0² + t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2²*u + x3² + t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5²*u + x1² z.D0.C0.Sub(&t[0], &x.D0.C0).Double(&z.D0.C0).Add(&z.D0.C0, &t[0]) z.D0.C1.Sub(&t[2], &x.D0.C1).Double(&z.D0.C1).Add(&z.D0.C1, &t[2]) @@ -413,9 +413,9 @@ func BatchInvertE24(a []E24) []E24 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q²⁴) and returns it // uses 2-bits windowed method -func (z *E24) Exp(x *E24, k big.Int) *E24 { +func (z *E24) Exp(x E24, k *big.Int) *E24 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -424,20 +424,20 @@ func (z *E24) Exp(x *E24, k big.Int) *E24 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E24 var ops [3]E24 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -459,37 +459,37 @@ func (z *E24) Exp(x *E24, k big.Int) *E24 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q²⁴) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E24) CyclotomicExp(x *E24, k big.Int) *E24 { +func (z *E24) CyclotomicExp(x E24, k *big.Int) *E24 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, 
we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q²⁴) == (x⁻¹)ᵏ (mod q²⁴) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E24 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -498,12 +498,12 @@ func (z *E24) CyclotomicExp(x *E24, k big.Int) *E24 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q²⁴) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { +func (z *E24) ExpGLV(x E24, k *big.Int) *E24 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -511,14 +511,14 @@ func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q²⁴) == (x⁻¹)ᵏ (mod q²⁴) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E24 @@ -528,11 +528,11 @@ func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) @@ -793,10 +793,10 @@ func (z *E24) IsInSubGroup() bool { // CompressTorus GT/E24 element to half its size // z must be in the cyclotomic subgroup -// i.e. z^(p^4-p^2+1)=1 +// i.e. z^(p⁴-p²+1)=1 // e.g. GT // "COMPRESSION IN FINITE FIELDS AND TORUS-BASED CRYPTOGRAPHY", K. RUBIN AND A. 
SILVERBERG -// z.C1 == 0 only when z \in {-1,1} +// z.C1 == 0 only when z ∈ {-1,1} func (z *E24) CompressTorus() (E12, error) { if z.D1.IsZero() { diff --git a/ecc/bls24-315/internal/fptower/e24_test.go b/ecc/bls24-315/internal/fptower/e24_test.go index 227b78300e..ff70344d14 100644 --- a/ecc/bls24-315/internal/fptower/e24_test.go +++ b/ecc/bls24-315/internal/fptower/e24_test.go @@ -422,8 +422,8 @@ func TestE24Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -437,7 +437,7 @@ func TestE24Ops(t *testing.T) { q := fp.Modulus() b.Frobenius(a) c.Set(a) - c.Exp(&c, *q) + c.Exp(c, q) return c.Equal(&b) }, genA, @@ -448,7 +448,7 @@ func TestE24Ops(t *testing.T) { var b, c E24 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, @@ -459,7 +459,7 @@ func TestE24Ops(t *testing.T) { var b, c E24 q := fp.Modulus() b.FrobeniusQuad(a) - c.Exp(a, *q).Exp(&c, *q).Exp(&c, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q).Exp(c, q).Exp(c, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls24-315/internal/fptower/e4.go b/ecc/bls24-315/internal/fptower/e4.go index d73f787dd1..34fe0659c4 100644 --- a/ecc/bls24-315/internal/fptower/e4.go +++ b/ecc/bls24-315/internal/fptower/e4.go @@ -215,40 +215,37 @@ func (z *E4) Inverse(x *E4) *E4 { return z } -// Exp sets z=x**e and returns it -func (z *E4) Exp(x *E4, e big.Int) *E4 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q⁴) and returns it +func (z *E4) Exp(x E4, k *big.Int) *E4 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁴) == (x⁻¹)ᵏ (mod q⁴) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = *bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(&k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } - var res E4 - res.SetOne() + z.SetOne() b := e.Bytes() - for i := range b { + for i := 0; i < len(b); i++ { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + for j := 0; j < 8; j++ { + z.Square(z) + if (w & (0b10000000 >> j)) != 0 { + z.Mul(z, &x) } - mask = mask >> 1 } } - z.Set(&res) + return z } @@ -299,13 +296,13 @@ func (z *E4) Sqrt(x *E4) *E4 { var exp, one big.Int one.SetUint64(1) exp.Mul(q, q).Sub(&exp, &one).Rsh(&exp, 1) - d.Exp(&c, exp) + d.Exp(c, &exp) e.Mul(&d, &c).Inverse(&e) f.Mul(&d, &c).Square(&f) // computation exp.Rsh(&exp, 1) - b.Exp(x, exp) + b.Exp(*x, &exp) b.norm(&_b) o.SetOne() if _b.Equal(&o) { diff --git a/ecc/bls24-315/internal/fptower/e4_test.go b/ecc/bls24-315/internal/fptower/e4_test.go index d84267b2c2..a3a52e8f34 100644 --- a/ecc/bls24-315/internal/fptower/e4_test.go +++ b/ecc/bls24-315/internal/fptower/e4_test.go @@ -259,7 +259,7 @@ func TestE4Ops(t *testing.T) { var b, c E4 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go index bc1d1383dc..e0102f8864 100644 --- a/ecc/bls24-315/pairing_test.go +++ b/ecc/bls24-315/pairing_test.go @@ -78,9 +78,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + 
d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -99,7 +99,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -126,9 +126,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -396,21 +396,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bls24-317/internal/fptower/e12.go b/ecc/bls24-317/internal/fptower/e12.go index 315432f75b..785fea776d 100644 --- a/ecc/bls24-317/internal/fptower/e12.go +++ b/ecc/bls24-317/internal/fptower/e12.go @@ -240,23 +240,49 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it -func (z *E12) Exp(x *E12, e big.Int) *E12 { +// Exp sets z=xᵏ (mod q¹²) and returns it +// uses 2-bits windowed method +func (z *E12) Exp(x E12, k *big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) + } + var res E12 + var ops [3]E12 + res.SetOne() + ops[0].Set(&x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/ecc/bls24-317/internal/fptower/e12_test.go b/ecc/bls24-317/internal/fptower/e12_test.go index 19adb05dd2..76d4a6c9e0 100644 --- a/ecc/bls24-317/internal/fptower/e12_test.go +++ b/ecc/bls24-317/internal/fptower/e12_test.go @@ -248,6 +248,6 @@ func BenchmarkE12ExpBySeed(b *testing.B) { _, _ = a.SetRandom() b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, seed).Conjugate(&a) + a.Exp(a, &seed).Conjugate(&a) } } diff --git a/ecc/bls24-317/internal/fptower/e2.go b/ecc/bls24-317/internal/fptower/e2.go index f5d018924f..688d71776b 100644 --- a/ecc/bls24-317/internal/fptower/e2.go +++ b/ecc/bls24-317/internal/fptower/e2.go @@ -162,23 +162,23 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, e *big.Int) *E2 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ 
(mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } z.SetOne() diff --git a/ecc/bls24-317/internal/fptower/e24.go b/ecc/bls24-317/internal/fptower/e24.go index f25d66539d..c29a23021e 100644 --- a/ecc/bls24-317/internal/fptower/e24.go +++ b/ecc/bls24-317/internal/fptower/e24.go @@ -153,25 +153,25 @@ func (z *E24) CyclotomicSquareCompressed(x *E24) *E24 { var t [7]E4 - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.D0.C1) - // t1 = g5^2 + // t1 = g5² t[1].Square(&x.D1.C2) // t5 = g1 + g5 t[5].Add(&x.D0.C1, &x.D1.C2) - // t2 = (g1 + g5)^2 + // t2 = (g1 + g5)² t[2].Square(&t[5]) - // t3 = g1^2 + g5^2 + // t3 = g1² + g5² t[3].Add(&t[0], &t[1]) // t5 = 2 * g1 * g5 t[5].Sub(&t[2], &t[3]) // t6 = g3 + g2 t[6].Add(&x.D1.C0, &x.D0.C2) - // t3 = (g3 + g2)^2 + // t3 = (g3 + g2)² t[3].Square(&t[6]) - // t2 = g3^2 + // t2 = g3² t[2].Square(&x.D1.C0) // t6 = 2 * nr * g1 * g5 @@ -182,33 +182,33 @@ func (z *E24) CyclotomicSquareCompressed(x *E24) *E24 { // z3 = 6 * nr * g1 * g5 + 2 * g3 z.D1.C0.Add(&t[5], &t[6]) - // t4 = nr * g5^2 + // t4 = nr * g5² t[4].MulByNonResidue(&t[1]) - // t5 = nr * g5^2 + g1^2 + // t5 = nr * g5² + g1² t[5].Add(&t[0], &t[4]) - // t6 = nr * g5^2 + g1^2 - g2 + // t6 = nr * g5² + g1² - g2 t[6].Sub(&t[5], &x.D0.C2) - // t1 = g2^2 + // t1 = g2² t[1].Square(&x.D0.C2) - // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + // t6 = 2 * nr * g5² + 2 * g1² - 2*g2 t[6].Double(&t[6]) - // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + // z2 = 3 * nr * g5² + 3 * g1² - 2*g2 z.D0.C2.Add(&t[6], &t[5]) - // t4 = nr * g2^2 + // t4 = nr * g2² t[4].MulByNonResidue(&t[1]) - // t5 = g3^2 + nr * g2^2 + // t5 = g3² + nr * g2² t[5].Add(&t[2], &t[4]) - // t6 = g3^2 + nr * g2^2 - g1 + // t6 = g3² + nr * g2² - g1 t[6].Sub(&t[5], &x.D0.C1) - // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + // t6 = 2 * g3² + 2 * nr * g2² - 2 * g1 t[6].Double(&t[6]) - // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + // z1 = 3 * g3² + 3 * nr * g2² - 2 * g1 z.D0.C1.Add(&t[6], &t[5]) - // t0 = g2^2 + g3^2 + // t0 = g2² + g3² t[0].Add(&t[2], &t[1]) // t5 = 2 * g3 * g2 t[5].Sub(&t[3], &t[0]) @@ -229,13 +229,13 @@ func (z *E24) DecompressKarabina(x *E24) *E24 { var one E4 one.SetOne() - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.D0.C1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t[1].Sub(&t[0], &x.D0.C2). Double(&t[1]). Add(&t[1], &t[0]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t[2].Square(&x.D1.C2) t[0].MulByNonResidue(&t[2]). Add(&t[0], &t[1]) @@ -248,14 +248,14 @@ func (z *E24) DecompressKarabina(x *E24) *E24 { // t1 = g2 * g1 t[1].Mul(&x.D0.C2, &x.D0.C1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t[2].Square(&x.D1.C1). Sub(&t[2], &t[1]). Double(&t[2]). Sub(&t[2], &t[1]) // t1 = g3 * g5 t[1].Mul(&x.D1.C0, &x.D1.C2) - // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // c₀ = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t[2].Add(&t[2], &t[1]) z.D0.C0.MulByNonResidue(&t[2]). Add(&z.D0.C0, &one) @@ -284,13 +284,13 @@ func BatchDecompressKarabina(x []E24) []E24 { one.SetOne() for i := 0; i < n; i++ { - // t0 = g1^2 + // t0 = g1² t0[i].Square(&x[i].D0.C1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t1[i].Sub(&t0[i], &x[i].D0.C2). Double(&t1[i]). Add(&t1[i], &t0[i]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t2[i].Square(&x[i].D1.C2) t0[i].MulByNonResidue(&t2[i]). 
Add(&t0[i], &t1[i]) @@ -307,7 +307,7 @@ func BatchDecompressKarabina(x []E24) []E24 { // t1 = g2 * g1 t1[i].Mul(&x[i].D0.C2, &x[i].D0.C1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t2[i].Square(&x[i].D1.C1) t2[i].Sub(&t2[i], &t1[i]) t2[i].Double(&t2[i]) @@ -315,7 +315,7 @@ func BatchDecompressKarabina(x []E24) []E24 { // t1 = g3 * g5 t1[i].Mul(&x[i].D1.C0, &x[i].D1.C2) - // z0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // z0 = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t2[i].Add(&t2[i], &t1[i]) x[i].D0.C0.MulByNonResidue(&t2[i]). Add(&x[i].D0.C0, &one) @@ -328,10 +328,10 @@ func BatchDecompressKarabina(x []E24) []E24 { // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E24) CyclotomicSquare(x *E24) *E24 { - // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E4^6 - // cyclosquare(x)=(3*x4^2*u + 3*x0^2 - 2*x0, - // 3*x2^2*u + 3*x3^2 - 2*x1, - // 3*x5^2*u + 3*x1^2 - 2*x2, + // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E4⁶ + // cyclosquare(x)=(3*x4²*u + 3*x0² - 2*x0, + // 3*x2²*u + 3*x3² - 2*x1, + // 3*x5²*u + 3*x1² - 2*x2, // 6*x1*x5*u + 2*x3, // 6*x0*x4 + 2*x4, // 6*x2*x3 + 2*x5) @@ -348,9 +348,9 @@ func (z *E24) CyclotomicSquare(x *E24) *E24 { t[5].Square(&x.D0.C1) t[8].Add(&x.D1.C2, &x.D0.C1).Square(&t[8]).Sub(&t[8], &t[4]).Sub(&t[8], &t[5]).MulByNonResidue(&t[8]) // 2*x5*x1*u - t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4^2*u + x0^2 - t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2^2*u + x3^2 - t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5^2*u + x1^2 + t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4²*u + x0² + t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2²*u + x3² + t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5²*u + x1² z.D0.C0.Sub(&t[0], &x.D0.C0).Double(&z.D0.C0).Add(&z.D0.C0, &t[0]) z.D0.C1.Sub(&t[2], &x.D0.C1).Double(&z.D0.C1).Add(&z.D0.C1, &t[2]) @@ -413,9 +413,9 @@ func BatchInvertE24(a []E24) []E24 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q²⁴) and returns it // uses 2-bits windowed method -func (z *E24) Exp(x *E24, k big.Int) *E24 { +func (z *E24) Exp(x E24, k *big.Int) *E24 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -424,20 +424,20 @@ func (z *E24) Exp(x *E24, k big.Int) *E24 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E24 var ops [3]E24 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -459,37 +459,37 @@ func (z *E24) Exp(x *E24, k big.Int) *E24 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q²⁴) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E24) CyclotomicExp(x *E24, k big.Int) *E24 { +func (z *E24) CyclotomicExp(x E24, k *big.Int) *E24 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q²⁴) == (x⁻¹)ᵏ (mod q²⁴) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E24 - 
xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -498,12 +498,12 @@ func (z *E24) CyclotomicExp(x *E24, k big.Int) *E24 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q²⁴) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { +func (z *E24) ExpGLV(x E24, k *big.Int) *E24 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -511,14 +511,14 @@ func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q²⁴) == (x⁻¹)ᵏ (mod q²⁴) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E24 @@ -528,11 +528,11 @@ func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) @@ -793,10 +793,10 @@ func (z *E24) IsInSubGroup() bool { // CompressTorus GT/E24 element to half its size // z must be in the cyclotomic subgroup -// i.e. z^(p^4-p^2+1)=1 +// i.e. z^(p⁴-p²+1)=1 // e.g. GT // "COMPRESSION IN FINITE FIELDS AND TORUS-BASED CRYPTOGRAPHY", K. RUBIN AND A. 
SILVERBERG -// z.C1 == 0 only when z \in {-1,1} +// z.C1 == 0 only when z ∈ {-1,1} func (z *E24) CompressTorus() (E12, error) { if z.D1.IsZero() { diff --git a/ecc/bls24-317/internal/fptower/e24_test.go b/ecc/bls24-317/internal/fptower/e24_test.go index b5c2dec411..6f235ca829 100644 --- a/ecc/bls24-317/internal/fptower/e24_test.go +++ b/ecc/bls24-317/internal/fptower/e24_test.go @@ -422,8 +422,8 @@ func TestE24Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -437,7 +437,7 @@ func TestE24Ops(t *testing.T) { q := fp.Modulus() b.Frobenius(a) c.Set(a) - c.Exp(&c, *q) + c.Exp(c, q) return c.Equal(&b) }, genA, @@ -448,7 +448,7 @@ func TestE24Ops(t *testing.T) { var b, c E24 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, @@ -459,7 +459,7 @@ func TestE24Ops(t *testing.T) { var b, c E24 q := fp.Modulus() b.FrobeniusQuad(a) - c.Exp(a, *q).Exp(&c, *q).Exp(&c, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q).Exp(c, q).Exp(c, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls24-317/internal/fptower/e4.go b/ecc/bls24-317/internal/fptower/e4.go index bed1311c0d..63e6b37321 100644 --- a/ecc/bls24-317/internal/fptower/e4.go +++ b/ecc/bls24-317/internal/fptower/e4.go @@ -160,7 +160,7 @@ func (z *E4) MulByNonResidue(x *E4) *E4 { return z } -// MulByNonResidueInv mul x by (0,1)^{-1} +// MulByNonResidueInv mul x by (0,1)⁻¹ func (z *E4) MulByNonResidueInv(x *E4) *E4 { a := x.B1 var uInv E2 @@ -216,40 +216,37 @@ func (z *E4) Inverse(x *E4) *E4 { return z } -// Exp sets z=x**e and returns it -func (z *E4) Exp(x *E4, e big.Int) *E4 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q⁴) and returns it +func (z *E4) Exp(x E4, k *big.Int) *E4 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁴) == (x⁻¹)ᵏ (mod q⁴) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = *bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(&k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } - var res E4 - res.SetOne() + z.SetOne() b := e.Bytes() - for i := range b { + for i := 0; i < len(b); i++ { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + for j := 0; j < 8; j++ { + z.Square(z) + if (w & (0b10000000 >> j)) != 0 { + z.Mul(z, &x) } - mask = mask >> 1 } } - z.Set(&res) + return z } @@ -300,13 +297,13 @@ func (z *E4) Sqrt(x *E4) *E4 { var exp, one big.Int one.SetUint64(1) exp.Mul(q, q).Sub(&exp, &one).Rsh(&exp, 1) - d.Exp(&c, exp) + d.Exp(c, &exp) e.Mul(&d, &c).Inverse(&e) f.Mul(&d, &c).Square(&f) // computation exp.Rsh(&exp, 1) - b.Exp(x, exp) + b.Exp(*x, &exp) b.norm(&_b) o.SetOne() if _b.Equal(&o) { diff --git a/ecc/bls24-317/internal/fptower/e4_test.go b/ecc/bls24-317/internal/fptower/e4_test.go index 0afe602673..f0f9932b52 100644 --- a/ecc/bls24-317/internal/fptower/e4_test.go +++ b/ecc/bls24-317/internal/fptower/e4_test.go @@ -257,7 +257,7 @@ func TestE4Ops(t *testing.T) { var b, c E4 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls24-317/pairing_test.go b/ecc/bls24-317/pairing_test.go index b4a4595f81..65bd739a75 100644 --- a/ecc/bls24-317/pairing_test.go +++ 
b/ecc/bls24-317/pairing_test.go @@ -77,9 +77,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -98,7 +98,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -125,9 +125,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -394,21 +394,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bn254/internal/fptower/e12.go b/ecc/bn254/internal/fptower/e12.go index 43f09f1f80..950d60de6c 100644 --- a/ecc/bn254/internal/fptower/e12.go +++ b/ecc/bn254/internal/fptower/e12.go @@ -415,9 +415,9 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q¹²) and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, k big.Int) *E12 { +func (z *E12) Exp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -426,20 +426,20 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E12 var ops [3]E12 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -461,37 +461,37 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q¹²) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { +func (z *E12) CyclotomicExp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E12 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { 
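[Editor's note] The CyclotomicExp loop wrapped across this line walks a 2-NAF recoding of the exponent: signed digits in {-1, 0, 1} with no two adjacent non-zeros, so on average one multiplication per three squarings, at the price of needing a cheap inverse (the conjugate, for unitary elements). A standalone sketch of the textbook recoding and the matching signed square-and-multiply; the toy prime and names are illustrative, and ecc.NafDecomposition's exact output layout may differ:

package main

import (
	"fmt"
	"math/big"
)

// naf returns the 2-NAF digits of k, least significant first.
func naf(k *big.Int) []int8 {
	e := new(big.Int).Set(k)
	four := big.NewInt(4)
	var digits []int8
	for e.Sign() > 0 {
		var d int8
		if e.Bit(0) == 1 {
			// pick d in {1,-1} with e ≡ d (mod 4), forcing the next digit to 0
			if new(big.Int).Mod(e, four).Int64() == 1 {
				d = 1
			} else {
				d = -1
			}
			e.Sub(e, big.NewInt(int64(d)))
		}
		digits = append(digits, d)
		e.Rsh(e, 1)
	}
	return digits
}

func main() {
	p := big.NewInt(1000003)
	x := big.NewInt(3)
	xInv := new(big.Int).ModInverse(x, p) // stands in for InverseUnitary

	digits := naf(big.NewInt(478)) // 478 = 2⁹ - 2⁵ - 2: three non-zero digits
	res := big.NewInt(1)
	for i := len(digits) - 1; i >= 0; i-- {
		res.Mul(res, res).Mod(res, p) // CyclotomicSquare analogue
		if digits[i] == 1 {
			res.Mul(res, x).Mod(res, p)
		} else if digits[i] == -1 {
			res.Mul(res, xInv).Mod(res, p)
		}
	}
	fmt.Println(res.Cmp(new(big.Int).Exp(x, big.NewInt(478), p)) == 0) // true
}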
res.Mul(&res, &xInv) } @@ -500,27 +500,27 @@ func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q¹²) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { +func (z *E12) ExpGLV(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E12 @@ -530,11 +530,11 @@ func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) diff --git a/ecc/bn254/internal/fptower/e12_pairing.go b/ecc/bn254/internal/fptower/e12_pairing.go index 9b36b67816..a4abaf510d 100644 --- a/ecc/bn254/internal/fptower/e12_pairing.go +++ b/ecc/bn254/internal/fptower/e12_pairing.go @@ -12,7 +12,7 @@ func (z *E12) nSquareCompressed(n int) { } } -// Expt set z to xᵗ in E12 and return z (t is the generator of the curve) +// Expt set z to xᵗ (mod q¹²) and return z (t is the generator of the curve) func (z *E12) Expt(x *E12) *E12 { // Expt computation is derived from the addition chain: // diff --git a/ecc/bn254/internal/fptower/e12_test.go b/ecc/bn254/internal/fptower/e12_test.go index ad38127038..a503e238cd 100644 --- a/ecc/bn254/internal/fptower/e12_test.go +++ b/ecc/bn254/internal/fptower/e12_test.go @@ -391,8 +391,8 @@ func TestE12Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -405,7 +405,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, @@ -416,7 +416,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, diff --git a/ecc/bn254/internal/fptower/e2.go b/ecc/bn254/internal/fptower/e2.go index f6bec9b381..3d12b8b7e8 100644 --- a/ecc/bn254/internal/fptower/e2.go +++ b/ecc/bn254/internal/fptower/e2.go @@ -170,23 +170,23 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, e *big.Int) *E2 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer 
bigIntPool.Put(e) + e.Neg(k) } z.SetOne() diff --git a/ecc/bn254/pairing_test.go b/ecc/bn254/pairing_test.go index e342770160..e29ae8bb76 100644 --- a/ecc/bn254/pairing_test.go +++ b/ecc/bn254/pairing_test.go @@ -76,9 +76,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -97,7 +97,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -124,9 +124,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -393,21 +393,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bw6-633/internal/fptower/e6.go b/ecc/bw6-633/internal/fptower/e6.go index fdc1afa39c..92f54a1dd3 100644 --- a/ecc/bw6-633/internal/fptower/e6.go +++ b/ecc/bw6-633/internal/fptower/e6.go @@ -155,25 +155,25 @@ func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { var t [7]fp.Element - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.B0.A1) - // t1 = g5^2 + // t1 = g5² t[1].Square(&x.B1.A2) // t5 = g1 + g5 t[5].Add(&x.B0.A1, &x.B1.A2) - // t2 = (g1 + g5)^2 + // t2 = (g1 + g5)² t[2].Square(&t[5]) - // t3 = g1^2 + g5^2 + // t3 = g1² + g5² t[3].Add(&t[0], &t[1]) // t5 = 2 * g1 * g5 t[5].Sub(&t[2], &t[3]) // t6 = g3 + g2 t[6].Add(&x.B1.A0, &x.B0.A2) - // t3 = (g3 + g2)^2 + // t3 = (g3 + g2)² t[3].Square(&t[6]) - // t2 = g3^2 + // t2 = g3² t[2].Square(&x.B1.A0) // t6 = 2 * nr * g1 * g5 @@ -184,33 +184,33 @@ func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { // z3 = 6 * nr * g1 * g5 + 2 * g3 z.B1.A0.Add(&t[5], &t[6]) - // t4 = nr * g5^2 + // t4 = nr * g5² t[4].MulByNonResidue(&t[1]) - // t5 = nr * g5^2 + g1^2 + // t5 = nr * g5² + g1² t[5].Add(&t[0], &t[4]) - // t6 = nr * g5^2 + g1^2 - g2 + // t6 = nr * g5² + g1² - g2 t[6].Sub(&t[5], &x.B0.A2) - // t1 = g2^2 + // t1 = g2² t[1].Square(&x.B0.A2) - // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + // t6 = 2 * nr * g5² + 2 * g1² - 2*g2 t[6].Double(&t[6]) - // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + // z2 = 3 * nr * g5² + 3 * g1² - 2*g2 z.B0.A2.Add(&t[6], &t[5]) - // t4 = nr * g2^2 + // t4 = nr * g2² t[4].MulByNonResidue(&t[1]) - // t5 = g3^2 + nr * g2^2 + // t5 = g3² + nr * g2² t[5].Add(&t[2], &t[4]) - // t6 = g3^2 + nr * g2^2 - g1 + // t6 = g3² + nr * g2² - g1 t[6].Sub(&t[5], &x.B0.A1) - // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + // t6 = 2 * g3² + 2 * nr * g2² - 2 * g1 t[6].Double(&t[6]) - // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + // z1 = 3 * g3² + 3 * nr * g2² - 2 * g1 z.B0.A1.Add(&t[6], &t[5]) - // t0 = g2^2 + g3^2 + // t0 = g2² + g3² t[0].Add(&t[2], &t[1]) // t5 = 2 * g3 * g2 t[5].Sub(&t[3], &t[0]) @@ -231,13 +231,13 @@ func (z *E6) Decompress(x *E6) *E6 { var one fp.Element 
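[Editor's note] Decompress here recovers the g0 and g4 coordinates dropped by CyclotomicSquareCompressed, so a run of Karabina compressed squarings pays only one decompression at the end. A hedged usage sketch: it assumes the public bw6-633 package re-exports these E6 methods through GT and that Generators and Pair keep their usual gnark-crypto signatures:

package main

import (
	"fmt"

	bw6633 "github.com/consensys/gnark-crypto/ecc/bw6-633"
)

func main() {
	// obtain an element of the cyclotomic subgroup via a pairing
	_, _, g1, g2 := bw6633.Generators()
	x, err := bw6633.Pair([]bw6633.G1Affine{g1}, []bw6633.G2Affine{g2})
	if err != nil {
		panic(err)
	}

	// chain compressed squarings, decompress once at the end
	var a, b bw6633.GT
	a.Set(&x)
	for i := 0; i < 8; i++ { // 8 is an arbitrary count
		a.CyclotomicSquareCompressed(&a)
	}
	a.Decompress(&a)

	// reference: the same run of plain cyclotomic squarings
	b.Set(&x)
	for i := 0; i < 8; i++ {
		b.CyclotomicSquare(&b)
	}
	fmt.Println(a.Equal(&b)) // true
}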
one.SetOne() - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.B0.A1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t[1].Sub(&t[0], &x.B0.A2). Double(&t[1]). Add(&t[1], &t[0]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t[2].Square(&x.B1.A2) t[0].MulByNonResidue(&t[2]). Add(&t[0], &t[1]) @@ -250,14 +250,14 @@ func (z *E6) Decompress(x *E6) *E6 { // t1 = g2 * g1 t[1].Mul(&x.B0.A2, &x.B0.A1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t[2].Square(&x.B1.A1). Sub(&t[2], &t[1]). Double(&t[2]). Sub(&t[2], &t[1]) // t1 = g3 * g5 t[1].Mul(&x.B1.A0, &x.B1.A2) - // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // c₀ = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t[2].Add(&t[2], &t[1]) z.B0.A0.MulByNonResidue(&t[2]). Add(&z.B0.A0, &one) @@ -273,10 +273,10 @@ func (z *E6) Decompress(x *E6) *E6 { // Granger-Scott's cyclotomic square // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E6) CyclotomicSquare(x *E6) *E6 { - // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3^6 - // cyclosquare(x)=(3*x4^2*u + 3*x0^2 - 2*x0, - // 3*x2^2*u + 3*x3^2 - 2*x1, - // 3*x5^2*u + 3*x1^2 - 2*x2, + // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3⁶ + // cyclosquare(x)=(3*x4²*u + 3*x0² - 2*x0, + // 3*x2²*u + 3*x3² - 2*x1, + // 3*x5²*u + 3*x1² - 2*x2, // 6*x1*x5*u + 2*x3, // 6*x0*x4 + 2*x4, // 6*x2*x3 + 2*x5) @@ -293,9 +293,9 @@ func (z *E6) CyclotomicSquare(x *E6) *E6 { t[5].Square(&x.B0.A1) t[8].Add(&x.B1.A2, &x.B0.A1).Square(&t[8]).Sub(&t[8], &t[4]).Sub(&t[8], &t[5]).MulByNonResidue(&t[8]) // 2*x5*x1*u - t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4^2*u + x0^2 - t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2^2*u + x3^2 - t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5^2*u + x1^2 + t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4²*u + x0² + t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2²*u + x3² + t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5²*u + x1² z.B0.A0.Sub(&t[0], &x.B0.A0).Double(&z.B0.A0).Add(&z.B0.A0, &t[0]) z.B0.A1.Sub(&t[2], &x.B0.A1).Double(&z.B0.A1).Add(&z.B0.A1, &t[2]) @@ -358,9 +358,9 @@ func BatchInvertE6(a []E6) []E6 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q⁶) and returns it // uses 2-bits windowed method -func (z *E6) Exp(x *E6, k big.Int) *E6 { +func (z *E6) Exp(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -368,21 +368,21 @@ func (z *E6) Exp(x *E6, k big.Int) *E6 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E6 var ops [3]E6 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -404,37 +404,37 @@ func (z *E6) Exp(x *E6, k big.Int) *E6 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q⁶) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { +func (z *E6) CyclotomicExp(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Conjugate(&x) // 
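[Editor's note] The conjugate-for-inverse shortcut that CyclotomicExp uses on negative exponents (and that InverseUnitary implements) holds because cyclotomic-subgroup elements are unitary: their norm is 1, so conjugation inverts them. The complex unit circle is the familiar special case:

package main

import (
	"fmt"
	"math/cmplx"
)

func main() {
	z := cmplx.Exp(1.3i)            // |z| = 1: a "unitary" element
	fmt.Println(cmplx.Conj(z), 1/z) // conjugate and inverse coincide
}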
we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E6 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -443,12 +443,12 @@ func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q⁶) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { +func (z *E6) ExpGLV(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -456,14 +456,14 @@ func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E6 @@ -473,11 +473,11 @@ func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) @@ -618,9 +618,9 @@ func (z *E6) IsInSubGroup() bool { _a.Frobenius(z) a.CyclotomicSquare(&_a).Mul(&a, &_a) // z^(3p) - // t(x)-1 = (-10-4x-13x^2+6x^3+7x^4-23x^5+19x^6-12x^7+2x^8+11x^9-7x^10)/3 - t[0].CyclotomicSquare(z) // z^2 - t[1].CyclotomicSquare(&t[0]) // z^4 + // t(x)-1 = (-10-4x-13x²+6x³+7x⁴-23x⁵+19x⁶-12x⁷+2x⁸+11x⁹-7x¹⁰)/3 + t[0].CyclotomicSquare(z) // z² + t[1].CyclotomicSquare(&t[0]) // z⁴ t[2].CyclotomicSquare(&t[1]). Mul(&t[2], &t[0]). Conjugate(&t[2]) // *z^(-10) @@ -630,52 +630,52 @@ func (z *E6) IsInSubGroup() bool { Mul(&t[4], &t[2]). Mul(&t[4], z). Expt(&t[4]). - Expt(&t[4]) // *z^(-13u^2) + Expt(&t[4]) // *z^(-13u²) t[5].Mul(&t[0], &t[1]). Expt(&t[5]). Expt(&t[5]). - Expt(&t[5]) // *z^(6u^3) + Expt(&t[5]) // *z^(6u³) tmp.Expt(z). Expt(&tmp). - Expt(&tmp) // z^(u^3) + Expt(&tmp) // z^(u³) t[6].Mul(&tmp, &t[5]). - Expt(&t[6]) // *z^(7u^4) + Expt(&t[6]) // *z^(7u⁴) t[7].CyclotomicSquare(&t[5]). - CyclotomicSquare(&t[7]) // z^(24u^3) - tmp.Conjugate(&tmp) // z^(-u^3) + CyclotomicSquare(&t[7]) // z^(24u³) + tmp.Conjugate(&tmp) // z^(-u³) t[7].Mul(&t[7], &tmp). Conjugate(&t[7]). Expt(&t[7]). - Expt(&t[7]) // *z^(-23u^5) + Expt(&t[7]) // *z^(-23u⁵) t[8].Conjugate(&t[4]). Expt(&t[8]). Mul(&t[8], &t[5]). Expt(&t[8]). Expt(&t[8]). - Expt(&t[8]) // *z^(19u^6) + Expt(&t[8]) // *z^(19u⁶) t[9].Conjugate(&t[5]). CyclotomicSquare(&t[9]). Expt(&t[9]). Expt(&t[9]). Expt(&t[9]). - Expt(&t[9]) // *z^(-12u^7) + Expt(&t[9]) // *z^(-12u⁷) tmp.Expt(&t[7]). - Expt(&tmp) // z^(-23u^7) + Expt(&tmp) // z^(-23u⁷) t[10].Conjugate(&t[9]). CyclotomicSquare(&t[10]). 
- Mul(&t[10], &tmp) // z^(u^7) + Mul(&t[10], &tmp) // z^(u⁷) t[11].Mul(&t[9], &t[10]). Conjugate(&t[11]). Expt(&t[11]). - Expt(&t[11]) // *z^(11u^9) + Expt(&t[11]) // *z^(11u⁹) t[10].Expt(&t[10]). - CyclotomicSquare(&t[10]) // *z^(2u^8) + CyclotomicSquare(&t[10]) // *z^(2u⁸) t[12].Conjugate(&t[10]). CyclotomicSquare(&t[12]). Expt(&t[12]). Mul(&t[12], &t[11]). Expt(&t[12]). - Conjugate(&t[12]) // *z^(-7u^10) + Conjugate(&t[12]) // *z^(-7u¹⁰) b.Mul(&t[2], &t[3]). Mul(&b, &t[4]). @@ -693,10 +693,10 @@ func (z *E6) IsInSubGroup() bool { // CompressTorus GT/E6 element to half its size // z must be in the cyclotomic subgroup -// i.e. z^(p^4-p^2+1)=1 +// i.e. z^(p⁴-p²+1)=1 // e.g. GT // "COMPRESSION IN FINITE FIELDS AND TORUS-BASED CRYPTOGRAPHY", K. RUBIN AND A. SILVERBERG -// z.B1 == 0 only when z \in {-1,1} +// z.B1 == 0 only when z ∈ {-1,1} func (z *E6) CompressTorus() (E3, error) { if z.B1.IsZero() { diff --git a/ecc/bw6-633/internal/fptower/e6_test.go b/ecc/bw6-633/internal/fptower/e6_test.go index dd19286d57..8bda2d0922 100644 --- a/ecc/bw6-633/internal/fptower/e6_test.go +++ b/ecc/bw6-633/internal/fptower/e6_test.go @@ -328,8 +328,8 @@ func TestE6Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -342,7 +342,7 @@ func TestE6Ops(t *testing.T) { var b, c E6 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go index a8ed3ab2dc..3a3fd30c80 100644 --- a/ecc/bw6-633/pairing_test.go +++ b/ecc/bw6-633/pairing_test.go @@ -78,9 +78,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -99,7 +99,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -126,9 +126,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -396,21 +396,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bw6-756/internal/fptower/e6.go b/ecc/bw6-756/internal/fptower/e6.go index 16464db219..35d6b82a76 100644 --- a/ecc/bw6-756/internal/fptower/e6.go +++ b/ecc/bw6-756/internal/fptower/e6.go @@ -154,25 +154,25 @@ func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { var t [7]fp.Element - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.B0.A1) - // t1 = g5^2 + // t1 = g5² t[1].Square(&x.B1.A2) // t5 = g1 + g5 t[5].Add(&x.B0.A1, &x.B1.A2) - // t2 = (g1 + g5)^2 + // t2 = (g1 + g5)² t[2].Square(&t[5]) - // t3 = g1^2 + g5^2 + // t3 = g1² + g5² t[3].Add(&t[0], &t[1]) // t5 = 2 * g1 * g5 t[5].Sub(&t[2], &t[3]) // 
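[Editor's note] A quick cross-check of the cyclotomic-squaring code touched here: on a cyclotomic-subgroup element, CyclotomicSquare must agree with the generic Square, it just costs far fewer multiplications. Hedged sketch against the public bw6-756 package, assuming GT re-exports the E6 methods and the usual Generators/Pair signatures:

package main

import (
	"fmt"

	bw6756 "github.com/consensys/gnark-crypto/ecc/bw6-756"
)

func main() {
	_, _, g1, g2 := bw6756.Generators()
	x, err := bw6756.Pair([]bw6756.G1Affine{g1}, []bw6756.G2Affine{g2})
	if err != nil {
		panic(err)
	}
	var a, b bw6756.GT
	a.CyclotomicSquare(&x)   // valid only in the cyclotomic subgroup
	b.Square(&x)             // generic E6 squaring
	fmt.Println(a.Equal(&b)) // true
}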
t6 = g3 + g2 t[6].Add(&x.B1.A0, &x.B0.A2) - // t3 = (g3 + g2)^2 + // t3 = (g3 + g2)² t[3].Square(&t[6]) - // t2 = g3^2 + // t2 = g3² t[2].Square(&x.B1.A0) // t6 = 2 * nr * g1 * g5 @@ -183,33 +183,33 @@ func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { // z3 = 6 * nr * g1 * g5 + 2 * g3 z.B1.A0.Add(&t[5], &t[6]) - // t4 = nr * g5^2 + // t4 = nr * g5² t[4].MulByNonResidue(&t[1]) - // t5 = nr * g5^2 + g1^2 + // t5 = nr * g5² + g1² t[5].Add(&t[0], &t[4]) - // t6 = nr * g5^2 + g1^2 - g2 + // t6 = nr * g5² + g1² - g2 t[6].Sub(&t[5], &x.B0.A2) - // t1 = g2^2 + // t1 = g2² t[1].Square(&x.B0.A2) - // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + // t6 = 2 * nr * g5² + 2 * g1² - 2*g2 t[6].Double(&t[6]) - // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + // z2 = 3 * nr * g5² + 3 * g1² - 2*g2 z.B0.A2.Add(&t[6], &t[5]) - // t4 = nr * g2^2 + // t4 = nr * g2² t[4].MulByNonResidue(&t[1]) - // t5 = g3^2 + nr * g2^2 + // t5 = g3² + nr * g2² t[5].Add(&t[2], &t[4]) - // t6 = g3^2 + nr * g2^2 - g1 + // t6 = g3² + nr * g2² - g1 t[6].Sub(&t[5], &x.B0.A1) - // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + // t6 = 2 * g3² + 2 * nr * g2² - 2 * g1 t[6].Double(&t[6]) - // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + // z1 = 3 * g3² + 3 * nr * g2² - 2 * g1 z.B0.A1.Add(&t[6], &t[5]) - // t0 = g2^2 + g3^2 + // t0 = g2² + g3² t[0].Add(&t[2], &t[1]) // t5 = 2 * g3 * g2 t[5].Sub(&t[3], &t[0]) @@ -230,13 +230,13 @@ func (z *E6) Decompress(x *E6) *E6 { var one fp.Element one.SetOne() - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.B0.A1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t[1].Sub(&t[0], &x.B0.A2). Double(&t[1]). Add(&t[1], &t[0]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t[2].Square(&x.B1.A2) t[0].MulByNonResidue(&t[2]). Add(&t[0], &t[1]) @@ -249,14 +249,14 @@ func (z *E6) Decompress(x *E6) *E6 { // t1 = g2 * g1 t[1].Mul(&x.B0.A2, &x.B0.A1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t[2].Square(&x.B1.A1). Sub(&t[2], &t[1]). Double(&t[2]). Sub(&t[2], &t[1]) // t1 = g3 * g5 t[1].Mul(&x.B1.A0, &x.B1.A2) - // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // c₀ = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t[2].Add(&t[2], &t[1]) z.B0.A0.MulByNonResidue(&t[2]). 
Add(&z.B0.A0, &one) @@ -272,10 +272,10 @@ func (z *E6) Decompress(x *E6) *E6 { // Granger-Scott's cyclotomic square // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E6) CyclotomicSquare(x *E6) *E6 { - // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3^6 - // cyclosquare(x)=(3*x4^2*u + 3*x0^2 - 2*x0, - // 3*x2^2*u + 3*x3^2 - 2*x1, - // 3*x5^2*u + 3*x1^2 - 2*x2, + // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3⁶ + // cyclosquare(x)=(3*x4²*u + 3*x0² - 2*x0, + // 3*x2²*u + 3*x3² - 2*x1, + // 3*x5²*u + 3*x1² - 2*x2, // 6*x1*x5*u + 2*x3, // 6*x0*x4 + 2*x4, // 6*x2*x3 + 2*x5) @@ -292,9 +292,9 @@ func (z *E6) CyclotomicSquare(x *E6) *E6 { t[5].Square(&x.B0.A1) t[8].Add(&x.B1.A2, &x.B0.A1).Square(&t[8]).Sub(&t[8], &t[4]).Sub(&t[8], &t[5]).MulByNonResidue(&t[8]) // 2*x5*x1*u - t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4^2*u + x0^2 - t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2^2*u + x3^2 - t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5^2*u + x1^2 + t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4²*u + x0² + t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2²*u + x3² + t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5²*u + x1² z.B0.A0.Sub(&t[0], &x.B0.A0).Double(&z.B0.A0).Add(&z.B0.A0, &t[0]) z.B0.A1.Sub(&t[2], &x.B0.A1).Double(&z.B0.A1).Add(&z.B0.A1, &t[2]) @@ -357,9 +357,9 @@ func BatchInvertE6(a []E6) []E6 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q⁶) and returns it // uses 2-bits windowed method -func (z *E6) Exp(x *E6, k big.Int) *E6 { +func (z *E6) Exp(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -367,21 +367,21 @@ func (z *E6) Exp(x *E6, k big.Int) *E6 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E6 var ops [3]E6 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -403,37 +403,37 @@ func (z *E6) Exp(x *E6, k big.Int) *E6 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q⁶) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { +func (z *E6) CyclotomicExp(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E6 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -442,12 +442,12 @@ func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q⁶) and returns it // uses 2-dimensional GLV 
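[Editor's note] ExpGLV, whose doc comment is wrapped across this line, relies on the identity x^k = x^s0 · φ(x)^s1 once SplitScalar writes k ≡ s0 + s1·λ (mod r) with half-size s0 and s1, where φ is the Frobenius acting on GT as exponentiation by λ. A toy check of the identity itself over integers; every number below is illustrative, not a curve parameter:

package main

import (
	"fmt"
	"math/big"
)

func main() {
	p := big.NewInt(23)     // toy field; 4 generates a subgroup of order r = 11
	x := big.NewInt(4)
	k := big.NewInt(7)
	lambda := big.NewInt(3) // stand-in for the Frobenius eigenvalue on GT
	// one valid split: 7 ≡ 1 + 2·3 (mod 11)
	s0, s1 := big.NewInt(1), big.NewInt(2)

	lhs := new(big.Int).Exp(x, k, p)

	phiX := new(big.Int).Exp(x, lambda, p) // φ(x) = x^λ
	rhs := new(big.Int).Exp(x, s0, p)
	rhs.Mul(rhs, new(big.Int).Exp(phiX, s1, p)).Mod(rhs, p)

	fmt.Println(lhs, rhs) // 8 8
}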
with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { +func (z *E6) ExpGLV(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -455,14 +455,14 @@ func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E6 @@ -472,11 +472,11 @@ func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) @@ -604,16 +604,16 @@ func (z *E6) SetBytes(e []byte) error { func (z *E6) IsInSubGroup() bool { var one, _z E6 one.SetOne() - _z.Exp(z, *fr.Modulus()) + _z.Exp(*z, fr.Modulus()) return _z.Equal(&one) } // CompressTorus GT/E6 element to half its size // z must be in the cyclotomic subgroup -// i.e. z^(p^4-p^2+1)=1 +// i.e. z^(p⁴-p²+1)=1 // e.g. GT // "COMPRESSION IN FINITE FIELDS AND TORUS-BASED CRYPTOGRAPHY", K. RUBIN AND A. SILVERBERG -// z.B1 == 0 only when z \in {-1,1} +// z.B1 == 0 only when z ∈ {-1,1} func (z *E6) CompressTorus() (E3, error) { if z.B1.IsZero() { diff --git a/ecc/bw6-756/internal/fptower/e6_test.go b/ecc/bw6-756/internal/fptower/e6_test.go index b74c1943bc..58048b0e8b 100644 --- a/ecc/bw6-756/internal/fptower/e6_test.go +++ b/ecc/bw6-756/internal/fptower/e6_test.go @@ -317,7 +317,7 @@ func TestE6Ops(t *testing.T) { var b, c E6 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go index 466a7c797d..ef0b6f670d 100644 --- a/ecc/bw6-756/pairing_test.go +++ b/ecc/bw6-756/pairing_test.go @@ -77,9 +77,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -98,7 +98,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -125,9 +125,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -394,21 +394,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, 
&_e) } }) } diff --git a/ecc/bw6-761/internal/fptower/e6.go b/ecc/bw6-761/internal/fptower/e6.go index 0f914d18d0..f211c4a0c7 100644 --- a/ecc/bw6-761/internal/fptower/e6.go +++ b/ecc/bw6-761/internal/fptower/e6.go @@ -154,25 +154,25 @@ func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { var t [7]fp.Element - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.B0.A1) - // t1 = g5^2 + // t1 = g5² t[1].Square(&x.B1.A2) // t5 = g1 + g5 t[5].Add(&x.B0.A1, &x.B1.A2) - // t2 = (g1 + g5)^2 + // t2 = (g1 + g5)² t[2].Square(&t[5]) - // t3 = g1^2 + g5^2 + // t3 = g1² + g5² t[3].Add(&t[0], &t[1]) // t5 = 2 * g1 * g5 t[5].Sub(&t[2], &t[3]) // t6 = g3 + g2 t[6].Add(&x.B1.A0, &x.B0.A2) - // t3 = (g3 + g2)^2 + // t3 = (g3 + g2)² t[3].Square(&t[6]) - // t2 = g3^2 + // t2 = g3² t[2].Square(&x.B1.A0) // t6 = 2 * nr * g1 * g5 @@ -183,33 +183,33 @@ func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { // z3 = 6 * nr * g1 * g5 + 2 * g3 z.B1.A0.Add(&t[5], &t[6]) - // t4 = nr * g5^2 + // t4 = nr * g5² t[4].MulByNonResidue(&t[1]) - // t5 = nr * g5^2 + g1^2 + // t5 = nr * g5² + g1² t[5].Add(&t[0], &t[4]) - // t6 = nr * g5^2 + g1^2 - g2 + // t6 = nr * g5² + g1² - g2 t[6].Sub(&t[5], &x.B0.A2) - // t1 = g2^2 + // t1 = g2² t[1].Square(&x.B0.A2) - // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + // t6 = 2 * nr * g5² + 2 * g1² - 2*g2 t[6].Double(&t[6]) - // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + // z2 = 3 * nr * g5² + 3 * g1² - 2*g2 z.B0.A2.Add(&t[6], &t[5]) - // t4 = nr * g2^2 + // t4 = nr * g2² t[4].MulByNonResidue(&t[1]) - // t5 = g3^2 + nr * g2^2 + // t5 = g3² + nr * g2² t[5].Add(&t[2], &t[4]) - // t6 = g3^2 + nr * g2^2 - g1 + // t6 = g3² + nr * g2² - g1 t[6].Sub(&t[5], &x.B0.A1) - // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + // t6 = 2 * g3² + 2 * nr * g2² - 2 * g1 t[6].Double(&t[6]) - // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + // z1 = 3 * g3² + 3 * nr * g2² - 2 * g1 z.B0.A1.Add(&t[6], &t[5]) - // t0 = g2^2 + g3^2 + // t0 = g2² + g3² t[0].Add(&t[2], &t[1]) // t5 = 2 * g3 * g2 t[5].Sub(&t[3], &t[0]) @@ -230,13 +230,13 @@ func (z *E6) Decompress(x *E6) *E6 { var one fp.Element one.SetOne() - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.B0.A1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t[1].Sub(&t[0], &x.B0.A2). Double(&t[1]). Add(&t[1], &t[0]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t[2].Square(&x.B1.A2) t[0].MulByNonResidue(&t[2]). Add(&t[0], &t[1]) @@ -249,14 +249,14 @@ func (z *E6) Decompress(x *E6) *E6 { // t1 = g2 * g1 t[1].Mul(&x.B0.A2, &x.B0.A1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t[2].Square(&x.B1.A1). Sub(&t[2], &t[1]). Double(&t[2]). Sub(&t[2], &t[1]) // t1 = g3 * g5 t[1].Mul(&x.B1.A0, &x.B1.A2) - // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // c₀ = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t[2].Add(&t[2], &t[1]) z.B0.A0.MulByNonResidue(&t[2]). 
Add(&z.B0.A0, &one) @@ -272,10 +272,10 @@ func (z *E6) Decompress(x *E6) *E6 { // Granger-Scott's cyclotomic square // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E6) CyclotomicSquare(x *E6) *E6 { - // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3^6 - // cyclosquare(x)=(3*x4^2*u + 3*x0^2 - 2*x0, - // 3*x2^2*u + 3*x3^2 - 2*x1, - // 3*x5^2*u + 3*x1^2 - 2*x2, + // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3⁶ + // cyclosquare(x)=(3*x4²*u + 3*x0² - 2*x0, + // 3*x2²*u + 3*x3² - 2*x1, + // 3*x5²*u + 3*x1² - 2*x2, // 6*x1*x5*u + 2*x3, // 6*x0*x4 + 2*x4, // 6*x2*x3 + 2*x5) @@ -292,9 +292,9 @@ func (z *E6) CyclotomicSquare(x *E6) *E6 { t[5].Square(&x.B0.A1) t[8].Add(&x.B1.A2, &x.B0.A1).Square(&t[8]).Sub(&t[8], &t[4]).Sub(&t[8], &t[5]).MulByNonResidue(&t[8]) // 2*x5*x1*u - t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4^2*u + x0^2 - t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2^2*u + x3^2 - t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5^2*u + x1^2 + t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4²*u + x0² + t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2²*u + x3² + t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5²*u + x1² z.B0.A0.Sub(&t[0], &x.B0.A0).Double(&z.B0.A0).Add(&z.B0.A0, &t[0]) z.B0.A1.Sub(&t[2], &x.B0.A1).Double(&z.B0.A1).Add(&z.B0.A1, &t[2]) @@ -357,9 +357,9 @@ func BatchInvertE6(a []E6) []E6 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q⁶) and returns it // uses 2-bits windowed method -func (z *E6) Exp(x *E6, k big.Int) *E6 { +func (z *E6) Exp(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -367,21 +367,21 @@ func (z *E6) Exp(x *E6, k big.Int) *E6 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E6 var ops [3]E6 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -403,37 +403,37 @@ func (z *E6) Exp(x *E6, k big.Int) *E6 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q⁶) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { +func (z *E6) CyclotomicExp(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E6 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -442,12 +442,12 @@ func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q⁶) and returns it // uses 2-dimensional GLV 
with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { +func (z *E6) ExpGLV(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -455,14 +455,14 @@ func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E6 @@ -472,11 +472,11 @@ func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) @@ -616,13 +616,13 @@ func (z *E6) IsInSubGroup() bool { _a.Frobenius(z) a.CyclotomicSquare(&_a).Mul(&a, &_a) // z^(3p) - // t(x)-1 = (13x^6 − 23x^5 − 9x^4 + 35x^3 + 10x + 19)/3 - t[0].CyclotomicSquare(z) // z^2 + // t(x)-1 = (13x⁶ − 23x⁵ − 9x⁴ + 35x³ + 10x + 19)/3 + t[0].CyclotomicSquare(z) // z² t[1].CyclotomicSquare(&t[0]). - CyclotomicSquare(&t[1]) // z^8 + CyclotomicSquare(&t[1]) // z⁸ t[2].CyclotomicSquare(&t[1]). Mul(&t[2], &t[0]). - Mul(&t[2], z) // z^19* + Mul(&t[2], z) // z¹⁹* t[3].Mul(&t[0], &t[1]). Expt(&t[3]) // z^(10u)* t[4].CyclotomicSquare(&t[3]). @@ -632,25 +632,25 @@ func (z *E6) IsInSubGroup() bool { Expt(&t[0]) // z^(5u) t[4].Mul(&t[4], &t[0]). Expt(&t[4]). - Expt(&t[4]) // z^(35u^3)* + Expt(&t[4]) // z^(35u³)* t[1].Mul(&t[1], z). Expt(&t[1]). Expt(&t[1]). Expt(&t[1]). Expt(&t[1]). - Conjugate(&t[1]) // z^(-9u^4)* + Conjugate(&t[1]) // z^(-9u⁴)* t[0].Expt(&t[0]). Expt(&t[0]). Expt(&t[0]). - Conjugate(&t[0]) // z^(-5u^4) + Conjugate(&t[0]) // z^(-5u⁴) t[5].CyclotomicSquare(&t[1]). Mul(&t[5], &t[0]). - Expt(&t[5]) // z^(-23u^5)* + Expt(&t[5]) // z^(-23u⁵)* tmp.CyclotomicSquare(&t[1]). - Conjugate(&tmp) // z^(18u^4) + Conjugate(&tmp) // z^(18u⁴) t[0].Mul(&t[0], &tmp). Expt(&t[0]). - Expt(&t[0]) // z^(13u^6)* + Expt(&t[0]) // z^(13u⁶)* b.Mul(&t[2], &t[3]). Mul(&b, &t[4]). @@ -663,10 +663,10 @@ func (z *E6) IsInSubGroup() bool { // CompressTorus GT/E6 element to half its size // z must be in the cyclotomic subgroup -// i.e. z^(p^4-p^2+1)=1 +// i.e. z^(p⁴-p²+1)=1 // e.g. GT // "COMPRESSION IN FINITE FIELDS AND TORUS-BASED CRYPTOGRAPHY", K. RUBIN AND A. 
SILVERBERG -// z.B1 == 0 only when z \in {-1,1} +// z.B1 == 0 only when z ∈ {-1,1} func (z *E6) CompressTorus() (E3, error) { if z.B1.IsZero() { diff --git a/ecc/bw6-761/internal/fptower/e6_test.go b/ecc/bw6-761/internal/fptower/e6_test.go index 512ebc9438..4841bb4564 100644 --- a/ecc/bw6-761/internal/fptower/e6_test.go +++ b/ecc/bw6-761/internal/fptower/e6_test.go @@ -328,8 +328,8 @@ func TestE6Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -342,7 +342,7 @@ func TestE6Ops(t *testing.T) { var b, c E6 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go index 323deef40f..e92c59fa99 100644 --- a/ecc/bw6-761/pairing_test.go +++ b/ecc/bw6-761/pairing_test.go @@ -78,9 +78,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -99,7 +99,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -126,9 +126,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -396,21 +396,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/internal/generator/pairing/template/tests/pairing.go.tmpl b/internal/generator/pairing/template/tests/pairing.go.tmpl index d77d7e45c8..5ea344b836 100644 --- a/internal/generator/pairing/template/tests/pairing.go.tmpl +++ b/internal/generator/pairing/template/tests/pairing.go.tmpl @@ -69,9 +69,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -95,7 +95,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -122,9 +122,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -401,21 +401,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - 
a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl index 5975f193b3..8fdcabdd00 100644 --- a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl @@ -398,9 +398,9 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q¹²) and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, k big.Int) *E12 { +func (z *E12) Exp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -409,20 +409,20 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E12 var ops [3]E12 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -444,37 +444,37 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q¹²) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { +func (z *E12) CyclotomicExp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E12 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -483,41 +483,41 @@ func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q¹²) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { +func (z *E12) ExpGLV(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } - var table [15]E12 + var table [15]E12 var res E12 var s1, s2 fr.Element res.SetOne() // table[b3b2b1b0-1] = 
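[Editor's note] The table comment split by the wrap here encodes a Straus-Shamir trick: entry b3b2b1b0 - 1 stores φ(x)^(b3b2) · x^(b1b0), so each 4-bit lookup advances a 2-bit window of both sub-scalars at once. A toy construction of such a table over modular integers; λ and all values are illustrative:

package main

import (
	"fmt"
	"math/big"
)

func main() {
	p := big.NewInt(1000003)
	x := big.NewInt(5)
	phiX := new(big.Int).Exp(x, big.NewInt(17), p) // φ modeled as x ↦ x^λ, λ = 17

	// table[i-1] = φ(x)^(i>>2) · x^(i&3), for i = 1..15
	var table [15]*big.Int
	for i := 1; i <= 15; i++ {
		hi := new(big.Int).Exp(phiX, big.NewInt(int64(i>>2)), p)
		lo := new(big.Int).Exp(x, big.NewInt(int64(i&3)), p)
		table[i-1] = hi.Mul(hi, lo)
		table[i-1].Mod(table[i-1], p)
	}

	// spot check: index 0b0110 is φ(x)¹·x², i.e. x^(17+2)
	want := new(big.Int).Exp(x, big.NewInt(19), p)
	fmt.Println(table[0b0110-1].Cmp(want) == 0) // true
}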
b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) diff --git a/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl index 35ed62c3e7..11612bc060 100644 --- a/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl @@ -155,32 +155,32 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, e *big.Int) *E2 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } z.SetOne() - b := e.Bytes() - for i :=0;i> j)) != 0 { + if (w & (0b10000000 >> j)) != 0 { z.Mul(z, &x) } } diff --git a/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl index 08b83a19cb..e94dc8988a 100644 --- a/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl @@ -376,8 +376,8 @@ func TestE12Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -390,7 +390,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, @@ -401,7 +401,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, From 5762868ce05a2482ab2688cec0e85618b50a7cc2 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Mon, 20 Jun 2022 11:23:27 +0100 Subject: [PATCH 13/16] test(tower): test negative exponent in Exp --- ecc/bls12-377/pairing_test.go | 9 ++++++--- ecc/bls12-378/pairing_test.go | 9 ++++++--- ecc/bls12-381/pairing_test.go | 9 ++++++--- ecc/bls24-315/pairing_test.go | 9 ++++++--- ecc/bls24-317/pairing_test.go | 9 ++++++--- ecc/bn254/pairing_test.go | 9 ++++++--- ecc/bw6-633/pairing_test.go | 9 ++++++--- ecc/bw6-756/pairing_test.go | 9 ++++++--- ecc/bw6-761/pairing_test.go | 9 ++++++--- .../generator/pairing/template/tests/pairing.go.tmpl | 9 ++++++--- 10 files changed, 60 insertions(+), 30 deletions(-) diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go index cfa761546c..c64efebada 100644 --- a/ecc/bls12-377/pairing_test.go +++ b/ecc/bls12-377/pairing_test.go @@ -69,15 +69,18 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(12) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + 
c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go index 020ace09a2..790d64d886 100644 --- a/ecc/bls12-378/pairing_test.go +++ b/ecc/bls12-378/pairing_test.go @@ -69,15 +69,18 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(12) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bls12-381/pairing_test.go b/ecc/bls12-381/pairing_test.go index 5c45046b1c..3262379ef8 100644 --- a/ecc/bls12-381/pairing_test.go +++ b/ecc/bls12-381/pairing_test.go @@ -69,15 +69,18 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(12) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go index e0102f8864..44a645790f 100644 --- a/ecc/bls24-315/pairing_test.go +++ b/ecc/bls24-315/pairing_test.go @@ -70,16 +70,19 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(24) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bls24-317/pairing_test.go b/ecc/bls24-317/pairing_test.go index 65bd739a75..23e7792c8f 100644 --- a/ecc/bls24-317/pairing_test.go +++ b/ecc/bls24-317/pairing_test.go @@ -70,15 +70,18 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(12) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bn254/pairing_test.go b/ecc/bn254/pairing_test.go index e29ae8bb76..da33256083 100644 --- a/ecc/bn254/pairing_test.go +++ b/ecc/bn254/pairing_test.go @@ -69,15 +69,18 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(12) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go index 3a3fd30c80..f8a0f84bbf 100644 --- a/ecc/bw6-633/pairing_test.go +++ b/ecc/bw6-633/pairing_test.go @@ -70,16 +70,19 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(6) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + 
c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go index ef0b6f670d..5c111bbd45 100644 --- a/ecc/bw6-756/pairing_test.go +++ b/ecc/bw6-756/pairing_test.go @@ -70,15 +70,18 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(12) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go index e92c59fa99..76bf81eb3a 100644 --- a/ecc/bw6-761/pairing_test.go +++ b/ecc/bw6-761/pairing_test.go @@ -70,16 +70,19 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(6) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/internal/generator/pairing/template/tests/pairing.go.tmpl b/internal/generator/pairing/template/tests/pairing.go.tmpl index 5ea344b836..a8209b7ac2 100644 --- a/internal/generator/pairing/template/tests/pairing.go.tmpl +++ b/internal/generator/pairing/template/tests/pairing.go.tmpl @@ -57,7 +57,7 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int {{if or (eq .Name "bw6-761") (eq .Name "bw6-633")}} k := new(big.Int).SetUint64(6) {{else if eq .Name "bls24-315"}} @@ -67,10 +67,13 @@ func TestPairing(t *testing.T) { {{- end}} e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) From c92a1d83ca94d74b95750d2be0f7dc5f322d129a Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Mon, 20 Jun 2022 18:01:11 +0100 Subject: [PATCH 14/16] =?UTF-8?q?feat:=20Multiexp=20=C3=A0=20la=20Pippenge?= =?UTF-8?q?r=20in=20GT?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ecc/bls12-377/pairing_test.go | 4 +- ecc/bls12-381/internal/fptower/multiexp.go | 1229 ++++++++++++++++++++ ecc/bls12-381/pairing_test.go | 177 +++ ecc/bls24-315/internal/fptower/multiexp.go | 1229 ++++++++++++++++++++ ecc/bls24-315/pairing_test.go | 182 ++- ecc/bls24-317/internal/fptower/multiexp.go | 1229 ++++++++++++++++++++ ecc/bls24-317/pairing_test.go | 180 ++- ecc/bn254/internal/fptower/multiexp.go | 1229 ++++++++++++++++++++ ecc/bn254/pairing_test.go | 177 +++ ecc/bw6-633/pairing_test.go | 180 ++- ecc/bw6-756/internal/fptower/multiexp.go | 569 +++++++++ ecc/bw6-756/pairing_test.go | 182 ++- ecc/bw6-761/internal/fptower/multiexp.go | 569 +++++++++ ecc/bw6-761/pairing_test.go | 180 ++- 14 files changed, 7299 insertions(+), 17 deletions(-) create mode 100644 ecc/bls12-381/internal/fptower/multiexp.go create mode 100644 ecc/bls24-315/internal/fptower/multiexp.go create mode 100644 ecc/bls24-317/internal/fptower/multiexp.go create mode 100644 ecc/bn254/internal/fptower/multiexp.go create mode 100644 ecc/bw6-756/internal/fptower/multiexp.go create mode 100644 ecc/bw6-761/internal/fptower/multiexp.go diff --git 
a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go
index fb33dd50da..49a6430e88 100644
--- a/ecc/bls12-377/pairing_test.go
+++ b/ecc/bls12-377/pairing_test.go
@@ -362,7 +362,7 @@ func TestMultiExpGT(t *testing.T) {
 			// compute expected result with double and add
 			var finalScalar, mixerBigInt big.Int
 			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
-			expected.ExpGLV(&_g, &finalScalar)
+			expected.ExpGLV(_g, &finalScalar)

 			// mixer ensures that all the words of a fpElement are set
 			var sampleScalars [nbSamples]fr.Element
@@ -417,7 +417,7 @@ func TestMultiExpGT(t *testing.T) {
 			var op1ScalarMul GT
 			finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer)
 			finalBigScalar.ToBigIntRegular(&finalBigScalarBi)
-			op1ScalarMul.ExpGLV(&_g, &finalBigScalarBi)
+			op1ScalarMul.ExpGLV(_g, &finalBigScalarBi)

 			return op1ScalarMul.Equal(&op1MultiExp)
 		},
diff --git a/ecc/bls12-381/internal/fptower/multiexp.go b/ecc/bls12-381/internal/fptower/multiexp.go
new file mode 100644
index 0000000000..a79d38b06e
--- /dev/null
+++ b/ecc/bls12-381/internal/fptower/multiexp.go
@@ -0,0 +1,1229 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bls12-381/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
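+// e.g. with c = 4, a window of value 13 (≥ 2^{c-1} = 8) is recoded as the digit 13 - 16 = -3 with a carry of 1 into the next window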
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
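+				// e.g. with c = 8, a digit of 200 becomes 200 - 256 = -56 and carry = 1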
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E12) MultiExp(points []E12, scalars []fr.Element, config ecc.MultiExpConfig) (*E12, error) {
+	// note:
+	// each of the MsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// (in the curve MSMs the buckets use jacobian extended formulas; here a bucket "addition" is a multiplication in GT)
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
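+		// e.g. for nbPoints = 2^16: cost(c=16) = 256/16 * (2^16 + 2^16) ≈ 2.10e6, while cost(c=12) = 256/12 * (2^16 + 2^12) ≈ 1.49e6, so a mid-range c wins here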
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerE12 , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]E12, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerE12(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerE12(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.Mul(p, &_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerE12(p *E12, c int, points []E12, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.MsmC4(points, scalars, splitFirstChunk) + + case 5: + p.MsmC5(points, scalars, splitFirstChunk) + + case 6: + p.MsmC6(points, scalars, splitFirstChunk) + + case 7: + p.MsmC7(points, scalars, splitFirstChunk) + + case 8: + p.MsmC8(points, scalars, splitFirstChunk) + + case 9: + p.MsmC9(points, scalars, splitFirstChunk) + + case 10: + p.MsmC10(points, scalars, splitFirstChunk) + + case 11: + p.MsmC11(points, scalars, splitFirstChunk) + + case 12: + p.MsmC12(points, scalars, splitFirstChunk) + + case 13: + p.MsmC13(points, scalars, splitFirstChunk) + + case 14: + p.MsmC14(points, scalars, splitFirstChunk) + + case 15: + p.MsmC15(points, scalars, splitFirstChunk) + + case 16: + p.MsmC16(points, scalars, splitFirstChunk) + + case 20: + p.MsmC20(points, scalars, splitFirstChunk) + + case 21: + p.MsmC21(points, scalars, splitFirstChunk) + + case 22: + p.MsmC22(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkE12 reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkE12(p *E12, c int, chChunks []chan E12) *E12 { + var _p E12 + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.CyclotomicSquare(&_p) + } + totalj := <-chChunks[j] + _p.Mul(&_p, &totalj) + } + + p.Set(&_p) + return p +} + +func msmProcessChunkE12(chunk uint64, + chRes chan<- E12, + buckets []E12, + c uint64, + points []E12, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].SetOne() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + 
s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		var tmp E12
+		// if the msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].Mul(&buckets[bits-1], &points[i])
+		} else {
+			// sub
+			tmp.Conjugate(&points[i])
+			buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp)
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var one, runningSum, total E12
+	runningSum.SetOne()
+	total.SetOne()
+	one.SetOne()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].Equal(&one) {
+			runningSum.Mul(&runningSum, &buckets[k])
+		}
+		total.Mul(&total, &runningSum)
+	}
+
+	chRes <- total
+
+}
+
+func (p *E12) MsmC4(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 {
+	const (
+		c        = 4                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E12
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E12, 1)
+	}
+
+	processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) {
+		var buckets [1 << (c - 1)]E12
+		msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E12, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE12(p, c, chChunks[:])
+}
+
+func (p *E12) MsmC5(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 {
+	const (
+		c        = 5                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan E12
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E12, 1)
+	}
+
+	// c doesn't divide 256, last window is smaller we can allocate less buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	go func(j uint64, points []E12, scalars []fr.Element) {
+		var buckets [1 << (lastC - 1)]E12
+		msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars)
+	}(uint64(nbChunks), points, scalars)
+
+	processChunk := func(j
int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC6(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 6 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC7(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 7 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + 
msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC8(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC9(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 9 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() 
{ + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC10(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC11(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + 
return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC12(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC13(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC14(points []E12, scalars []fr.Element, 
splitFirstChunk bool) *E12 { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC15(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC16(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = 
(fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC20(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC21(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + 
chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC22(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 22 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} diff --git a/ecc/bls12-381/pairing_test.go b/ecc/bls12-381/pairing_test.go index 3262379ef8..1162ce3a97 100644 --- a/ecc/bls12-381/pairing_test.go +++ b/ecc/bls12-381/pairing_test.go @@ -19,8 +19,11 @@ package bls12381 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" "github.com/leanovate/gopter" @@ -284,6 +287,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := 
gopter.NewProperties(parameters)
+
+	genScalar := GenFr()
+
+	// size of the multiExps
+	const nbSamples = 143
+
+	// multi exp points
+	var samplePoints [nbSamples]GT
+	var _g, g GT
+	_g.SetRandom()
+
+	// put into GT
+	_g = FinalExponentiation(&_g)
+
+	g.Set(&_g)
+	for i := 1; i <= nbSamples; i++ {
+		samplePoints[i-1].Set(&g)
+		g.Mul(&g, &_g)
+	}
+
+	// final scalar to use in double and add method (without mixer factor)
+	// n(n+1)(2n+1)/6 (sum of the squares from 1 to n)
+	var scalar big.Int
+	scalar.SetInt64(nbSamples)
+	scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1))
+	scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1))
+	scalar.Div(&scalar, new(big.Int).SetInt64(6))
+
+	// ensure a multiexp that is split has the same result as a non-split one
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, splitted1, splitted2 GT
+
+			// mixer ensures that all the words of a fpElement are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePoints[:], scalars16, true)
+
+			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected, g GT
+			g.SetRandom()
+			// put into GT
+			g = FinalExponentiation(&_g)
+
+			// compute expected result with double and add
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(_g, &finalScalar)
+
+			// mixer ensures that all the words of a fpElement are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect to have a different multiExp than the above bucket method
+	// for a small number of points
+	properties.Property("[GT] Multi exponentiation (<50 points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of a fpElement are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+				samplePoints[i-1].Set(&g)
+				g.Mul(&g, &_g)
+			}
+
+			var op1MultiExp GT
+			op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{})
+
+			var finalBigScalar fr.Element
+			var finalBigScalarBi big.Int
+			var op1ScalarMul GT
+			finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer)
+			finalBigScalar.ToBigIntRegular(&finalBigScalarBi)
+			op1ScalarMul.ExpGLV(_g, &finalBigScalarBi)
+
+			return op1ScalarMul.Equal(&op1MultiExp)
+		},
+		genScalar,
+	))
+
+	properties.TestingRun(t, gopter.ConsoleReporter(false))
+}
+
 // ------------------------------------------------------------
 // benches

@@ -414,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) {
 		}
 	})
 }
+
+func BenchmarkMultiExpGT(b *testing.B) {
+	// ensure every word of the scalars is filled
+	var mixer fr.Element
+	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
+
+	const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64-bit arch, 12 on 32-bit
+	const nbSamples = 1 << pow
+
+	var samplePoints [nbSamples]GT
+	var sampleScalars [nbSamples]fr.Element
+	var _g GT
+	_g.SetRandom()
+
+	for i := 1; i <= nbSamples; i++ {
+		sampleScalars[i-1].SetUint64(uint64(i)).
+			Mul(&sampleScalars[i-1], &mixer).
+			FromMont()
+		samplePoints[i-1].Set(&_g)
+	}
+
+	var testPoint GT
+
+	for i := 5; i <= pow; i++ {
+		using := 1 << i
+
+		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
+	}
+}
diff --git a/ecc/bls24-315/internal/fptower/multiexp.go b/ecc/bls24-315/internal/fptower/multiexp.go
new file mode 100644
index 0000000000..72d09fce70
--- /dev/null
+++ b/ecc/bls24-315/internal/fptower/multiexp.go
@@ -0,0 +1,1229 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bls24-315/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
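+// e.g. with c = 4, a window of value 13 (≥ 2^{c-1} = 8) is recoded as the digit 13 - 16 = -3 with a carry of 1 into the next window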
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant proportion of the points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
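+				// for illustration, with c = 4 (max = 8, msbWindow = 0b1000): a window
+				// value of 13 becomes 13 - 16 = -3 with a carry of 1, and -3 is encoded
+				// below as (-(-3)-1) | msbWindow = 0b1010, i.e. magnitude-1 in the low
+				// bits plus a sign bit, decodable with a single mask in the bucket phase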
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E24) MultiExp(points []E24, scalars []fr.Element, config ecc.MultiExpConfig) (*E24, error) {
+	// note:
+	// each of the MsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows us to declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar, nbChunks digits over c-bit wide windows
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// since GT is written multiplicatively, bucket accumulation uses E24 multiplications
+	// and the doublings of the usual point msm become cyclotomic squarings
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for a G2 MultiExp of > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
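+		// for illustration, with 256-bit scalars and nbPoints = 1<<16:
+		// c = 8  costs 256/8  * (65536 + 256)   ~ 2.11M group ops
+		// c = 16 costs 256/16 * (65536 + 65536) ~ 2.10M group ops
+		// i.e. the optimum sits roughly where 2^c reaches the order of nbPoints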
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerE24 , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]E24, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerE24(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerE24(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.Mul(p, &_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerE24(p *E24, c int, points []E24, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.MsmC4(points, scalars, splitFirstChunk) + + case 5: + p.MsmC5(points, scalars, splitFirstChunk) + + case 6: + p.MsmC6(points, scalars, splitFirstChunk) + + case 7: + p.MsmC7(points, scalars, splitFirstChunk) + + case 8: + p.MsmC8(points, scalars, splitFirstChunk) + + case 9: + p.MsmC9(points, scalars, splitFirstChunk) + + case 10: + p.MsmC10(points, scalars, splitFirstChunk) + + case 11: + p.MsmC11(points, scalars, splitFirstChunk) + + case 12: + p.MsmC12(points, scalars, splitFirstChunk) + + case 13: + p.MsmC13(points, scalars, splitFirstChunk) + + case 14: + p.MsmC14(points, scalars, splitFirstChunk) + + case 15: + p.MsmC15(points, scalars, splitFirstChunk) + + case 16: + p.MsmC16(points, scalars, splitFirstChunk) + + case 20: + p.MsmC20(points, scalars, splitFirstChunk) + + case 21: + p.MsmC21(points, scalars, splitFirstChunk) + + case 22: + p.MsmC22(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkE24 reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkE24(p *E24, c int, chChunks []chan E24) *E24 { + var _p E24 + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.CyclotomicSquare(&_p) + } + totalj := <-chChunks[j] + _p.Mul(&_p, &totalj) + } + + p.Set(&_p) + return p +} + +func msmProcessChunkE24(chunk uint64, + chRes chan<- E24, + buckets []E24, + c uint64, + points []E24, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].SetOne() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + 
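+	// for illustration, with c = 5 and chunk = 12: jc = 60, so the window spans
+	// bits 60..63 of word 0 plus bit 0 of word 1; shift = 60 > 64-c = 59 triggers
+	// the multi-word path below, with nbBitsHigh = 1 and shiftHigh = 4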
s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + var tmp E24 + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].Mul(&buckets[bits-1], &points[i]) + } else { + // sub + tmp.Conjugate(&points[i]) + buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var one, runningSum, total E24 + runningSum.SetOne() + total.SetOne() + one.SetOne() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].Equal(&one) { + runningSum.Mul(&runningSum, &buckets[k]) + } + total.Mul(&total, &runningSum) + } + + chRes <- total + +} + +func (p *E24) MsmC4(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 4 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC5(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 5 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j 
int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC6(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 6 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC7(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 7 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + 
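+		// 1 << (c-1) = 64 buckets for c = 7: the signed digits halve the bucket
+		// count, and a constant-size array keeps the buckets on the stack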
msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC8(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC9(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 9 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() 
{ + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC10(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC11(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + 
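+	// msmReduceChunkE24 recombines the per-chunk totals Horner-style: starting from
+	// the most significant chunk, square c times (i.e. multiply the exponent by 2^c),
+	// then fold in the next chunk's total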
return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC12(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC13(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC14(points []E24, scalars []fr.Element, 
splitFirstChunk bool) *E24 { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC15(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC16(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = 
(fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC20(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC21(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + 
chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC22(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 22 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go index 44a645790f..1663205050 100644 --- a/ecc/bls24-315/pairing_test.go +++ b/ecc/bls24-315/pairing_test.go @@ -19,8 +19,11 @@ package bls24315 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" "github.com/leanovate/gopter" @@ -43,7 +46,6 @@ func TestPairing(t *testing.T) { properties := gopter.NewProperties(parameters) genA := GenE24() - genR1 := GenFr() genR2 := GenFr() genP := GenFp() @@ -73,7 +75,6 @@ func TestPairing(t *testing.T) { var _e, ne big.Int k := 
new(big.Int).SetUint64(24) - e.Exp(e, k) e.ToBigIntRegular(&_e) ne.Neg(&_e) @@ -286,6 +287,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := gopter.NewProperties(parameters) + + genScalar := GenFr() + + // size of the multiExps + const nbSamples = 143 + + // multi exp points + var samplePoints [nbSamples]GT + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + // final scalar to use in double and add method (without mixer factor) + // n(n+1)(2n+1)/6 (sum of the squares from 1 to n) + var scalar big.Int + scalar.SetInt64(nbSamples) + scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1)) + scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1)) + scalar.Div(&scalar, new(big.Int).SetInt64(6)) + + // ensure a multiexp that's splitted has the same result as a non-splitted one.. + properties.Property("[GT] Multi exponentation (c=16) should be consistant with splitted multiexp", prop.ForAll( + func(mixer fr.Element) bool { + var samplePointsLarge [nbSamples * 13]GT + for i := 0; i < 13; i++ { + copy(samplePointsLarge[i*nbSamples:], samplePoints[:]) + } + + var r16, splitted1, splitted2 GT + + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples * 13]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } + + scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) + r16.MsmC16(samplePoints[:], scalars16, true) + + splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) + splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) + return r16.Equal(&splitted1) && r16.Equal(&splitted2) + }, + genScalar, + )) + + // we test only c = 5 and c = 16 + properties.Property("[GT] Multi exponentation (c=5, c=16) should be consistant with sum of square", prop.ForAll( + func(mixer fr.Element) bool { + + var expected, g GT + g.SetRandom() + // put into GT + g = FinalExponentiation(&_g) + + // compute expected result with double and add + var finalScalar, mixerBigInt big.Int + finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt)) + expected.ExpGLV(_g, &finalScalar) + + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU()) + scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) + + var r5, r16 GT + r5.MsmC5(samplePoints[:], scalars5, false) + r16.MsmC16(samplePoints[:], scalars16, true) + return (r5.Equal(&expected) && r16.Equal(&expected)) + }, + genScalar, + )) + + // note : this test is here as we expect to have a different multiExp than the above bucket method + // for small number of points + properties.Property("[GT] Multi exponentation (<50points) should be consistant with sum of square", prop.ForAll( + func(mixer fr.Element) bool { + + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + + // mixer ensures that all the words of a fpElement are set + samplePoints := make([]GT, 30) + sampleScalars := make([]fr.Element, 30) + + for i := 1; i <= 30; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + // ------------------------------------------------------------ // benches @@ -390,8 +531,7 @@ func BenchmarkExpGT(b *testing.B) { var e fp.Element e.SetRandom() - k := new(big.Int).SetUint64(24) - + k := new(big.Int).SetUint64(12) e.Exp(e, k) var _e big.Int e.ToBigIntRegular(&_e) @@ -417,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&_g) + } + + var testPoint GT + + for i := 5; i <= pow; i++ { + using := 1 << i + + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) + } +} diff --git a/ecc/bls24-317/internal/fptower/multiexp.go b/ecc/bls24-317/internal/fptower/multiexp.go new file mode 100644 index 0000000000..7bc7ef3d1a --- /dev/null +++ b/ecc/bls24-317/internal/fptower/multiexp.go @@ -0,0 +1,1229 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fptower + +import ( + "errors" + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" + "github.com/consensys/gnark-crypto/internal/parallel" + "math" + "runtime" +) + +/* Multi-Exponentiation à la Pippenger */ + +// selector stores the index, mask and shifts needed to select bits from a scalar +// it is used during the multiExp algorithm or the batch scalar multiplication +type selector struct { + index uint64 // index in the multi-word scalar to select bits from + mask uint64 // mask (c-bit wide) + shift uint64 // shift needed to get our bits on low positions + + multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) + maskHigh uint64 // same than mask, for index+1 + shiftHigh uint64 // same than shift, for index+1 +} + +// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits +// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract +// 2^{c} to the current digit, making it negative. +// negative digits can be processed in a later step as adding -G into the bucket instead of G +// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul) +// scalarsMont indicates wheter the provided scalars are in montgomery form +// returns smallValues, which represent the number of scalars which meets the following condition +// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { + toReturn := make([]fr.Element, len(scalars)) + + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) + } + selectors[chunk] = d + } + + // for each chunk, we could track the number of non-zeros points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represent a significant number of points, then we will split first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine + // if it does, though, this will deadlocK. 
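+	// smallValues is only a load-balancing hint: a scalar that fits in the low c bits
+	// contributes work to chunk 0 alone, so when such scalars exceed ~10% of the input,
+	// MultiExp splits the processing of chunk 0 in two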
+ chSmallValues := make(chan int, nbTasks) + + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int + + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.IsUint64() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } + + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] + + // init with carry if any + digit := carry + carry = 0 + + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + + // if digit is zero, no impact on result + if digit == 0 { + continue + } + + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. + if digit >= max { + digit -= (1 << c) + carry = 1 + } + + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } + + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues +} + +// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf +func (p *E24) MultiExp(points []E24, scalars []fr.Element, config ecc.MultiExpConfig) (*E24, error) { + // note: + // each of the MsmCX method is the same, except for the c constant it declares + // duplicating (through template generation) these methods allows to declare the buckets on the stack + // the choice of c needs to be improved: + // there is a theoritical value that gives optimal asymptotics + // but in practice, other factors come into play, including: + // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 + // * number of CPUs + // * cache friendliness (which depends on the host, G1 or G2... ) + // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. + + // for each MsmCX + // step 1 + // we compute, for each scalars over c-bit wide windows, nbChunk digits + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
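+	// after this rewriting every digit lies in [-2^{c-1}, 2^{c-1}-1], which is what
+	// lets step 2 make do with 2^{c-1} buckets (a negative digit conjugates the point)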
+ // negative digits will be processed in the next step as adding -G into the bucket instead of G + // (computing -G is cheap, and this saves us half of the buckets) + // step 2 + // buckets are declared on the stack + // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) + // we use jacobian extended formulas here as they are faster than mixed addition + // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel + // step 3 + // reduce the buckets weigthed sums into our result (msmReduceChunk) + + // ensure len(points) == len(scalars) + nbPoints := len(points) + if nbPoints != len(scalars) { + return nil, errors.New("len(points) != len(scalars)") + } + + // if nbTasks is not set, use all available CPUs + if config.NbTasks <= 0 { + config.NbTasks = runtime.NumCPU() + } + + // here, we compute the best C for nbPoints + // we split recursively until nbChunks(c) >= nbTasks, + bestC := func(nbPoints int) uint64 { + // implemented MsmC methods (the c we use must be in this slice) + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + var C uint64 + // approximate cost (in group operations) + // cost = bits/c * (nbPoints + 2^{c}) + // this needs to be verified empirically. + // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerE24 , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. 
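+	// each sub-MSM below computes the product of points[k]^scalars[k] over its slice
+	// of the input; GT being written multiplicatively, the partial results are folded
+	// into p with Mul rather than a point addition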
+ _p := make([]E24, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerE24(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerE24(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.Mul(p, &_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerE24(p *E24, c int, points []E24, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.MsmC4(points, scalars, splitFirstChunk) + + case 5: + p.MsmC5(points, scalars, splitFirstChunk) + + case 6: + p.MsmC6(points, scalars, splitFirstChunk) + + case 7: + p.MsmC7(points, scalars, splitFirstChunk) + + case 8: + p.MsmC8(points, scalars, splitFirstChunk) + + case 9: + p.MsmC9(points, scalars, splitFirstChunk) + + case 10: + p.MsmC10(points, scalars, splitFirstChunk) + + case 11: + p.MsmC11(points, scalars, splitFirstChunk) + + case 12: + p.MsmC12(points, scalars, splitFirstChunk) + + case 13: + p.MsmC13(points, scalars, splitFirstChunk) + + case 14: + p.MsmC14(points, scalars, splitFirstChunk) + + case 15: + p.MsmC15(points, scalars, splitFirstChunk) + + case 16: + p.MsmC16(points, scalars, splitFirstChunk) + + case 20: + p.MsmC20(points, scalars, splitFirstChunk) + + case 21: + p.MsmC21(points, scalars, splitFirstChunk) + + case 22: + p.MsmC22(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkE24 reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkE24(p *E24, c int, chChunks []chan E24) *E24 { + var _p E24 + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.CyclotomicSquare(&_p) + } + totalj := <-chChunks[j] + _p.Mul(&_p, &totalj) + } + + p.Set(&_p) + return p +} + +func msmProcessChunkE24(chunk uint64, + chRes chan<- E24, + buckets []E24, + c uint64, + points []E24, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].SetOne() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + var tmp E24 + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].Mul(&buckets[bits-1], &points[i]) + } else { + // sub + tmp.Conjugate(&points[i]) + buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var one, runningSum, total E24 + runningSum.SetOne() + total.SetOne() + one.SetOne() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].Equal(&one) { + runningSum.Mul(&runningSum, &buckets[k]) + } + total.Mul(&total, &runningSum) + } + + chRes <- total + +} + +func (p *E24) MsmC4(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 4 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC5(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 5 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC6(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 6 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 
64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC7(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 7 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC8(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop 
through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC9(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 9 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC10(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = 
(fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC11(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC12(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var 
buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC13(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC14(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + 
}(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC15(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC16(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + 
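+		// note: the two halves of the first chunk are processed concurrently below;
+		// their partial results arrive on chSplit and, GT being written multiplicatively,
+		// are combined with Mul before being forwarded to chChunks[0].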
split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC20(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC21(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, 
points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC22(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 22 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} diff --git a/ecc/bls24-317/pairing_test.go b/ecc/bls24-317/pairing_test.go index 23e7792c8f..55b48fe256 100644 --- a/ecc/bls24-317/pairing_test.go +++ b/ecc/bls24-317/pairing_test.go @@ -19,8 +19,11 @@ package bls24317 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fp" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" "github.com/leanovate/gopter" @@ -43,7 +46,6 @@ func TestPairing(t *testing.T) { properties := gopter.NewProperties(parameters) genA := GenE24() - genR1 := GenFr() genR2 := GenFr() genP := GenFp() @@ -285,6 +287,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := gopter.NewProperties(parameters) + + genScalar := GenFr() + + // size of the multiExps + const nbSamples = 143 + + // multi exp points + var samplePoints [nbSamples]GT + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + // final scalar to use in double and add method (without mixer factor) + // n(n+1)(2n+1)/6 (sum of the squares from 1 to n) + var scalar big.Int + scalar.SetInt64(nbSamples) + scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1)) + scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1)) + scalar.Div(&scalar, new(big.Int).SetInt64(6)) + + // ensure a multiexp that's splitted has 
the same result as a non-splitted one.
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, splitted1, splitted2 GT
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePointsLarge[:], scalars16, true)
+
+			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected GT
+
+			// compute the expected result with double and add, using _g (already in GT)
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(_g, &finalScalar)
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect to have a different multiExp than the above bucket method
+	// for a small number of points
+	properties.Property("[GT] Multi exponentiation (<50 points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of a fr.Element are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+ FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + // ------------------------------------------------------------ // benches @@ -389,7 +531,7 @@ func BenchmarkExpGT(b *testing.B) { var e fp.Element e.SetRandom() - k := new(big.Int).SetUint64(12) + k := new(big.Int).SetUint64(24) e.Exp(e, k) var _e big.Int e.ToBigIntRegular(&_e) @@ -415,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&_g) + } + + var testPoint GT + + for i := 5; i <= pow; i++ { + using := 1 << i + + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) + } +} diff --git a/ecc/bn254/internal/fptower/multiexp.go b/ecc/bn254/internal/fptower/multiexp.go new file mode 100644 index 0000000000..41d1c06ec0 --- /dev/null +++ b/ecc/bn254/internal/fptower/multiexp.go @@ -0,0 +1,1229 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if a digit is larger than 2^{c-1}, then we borrow 2^{c} from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// it also returns smallValues, the number of scalars which meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant proportion of the points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
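+	// illustrative walk-through (numbers only, with c = 4: mask = 0b1111, max = 8, msbWindow = 0b1000):
+	// a window holding 0b1110 (= 14 >= max) becomes digit = 14 - 16 = -2 with carry = 1;
+	// it is stored below as bits = (2-1) | msbWindow = 0b1001, i.e. "subtract bucket #1",
+	// which the bucket reduction later weights by 2 -- recovering the intended -2.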
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if the digit is zero, it has no impact on the result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^{c} from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E12) MultiExp(points []E12, scalars []fr.Element, config ecc.MultiExpConfig) (*E12, error) {
+	// note:
+	// each of the MsmCX methods is the same, except for the c constant it declares
+	// duplicating these methods (through template generation) allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunks digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^{c} from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerE12, but that would incur the cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
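+	// illustrative (assuming bestC keeps returning 16, i.e. 16 chunks per scalar):
+	// with config.NbTasks = 64, the loop above doubles nbSplits until
+	// nbChunks*nbSplits >= 64, ending with nbSplits = 4 slices of points,
+	// processed in parallel below and multiplied back together at the end.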
+ _p := make([]E12, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerE12(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerE12(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.Mul(p, &_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerE12(p *E12, c int, points []E12, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.MsmC4(points, scalars, splitFirstChunk) + + case 5: + p.MsmC5(points, scalars, splitFirstChunk) + + case 6: + p.MsmC6(points, scalars, splitFirstChunk) + + case 7: + p.MsmC7(points, scalars, splitFirstChunk) + + case 8: + p.MsmC8(points, scalars, splitFirstChunk) + + case 9: + p.MsmC9(points, scalars, splitFirstChunk) + + case 10: + p.MsmC10(points, scalars, splitFirstChunk) + + case 11: + p.MsmC11(points, scalars, splitFirstChunk) + + case 12: + p.MsmC12(points, scalars, splitFirstChunk) + + case 13: + p.MsmC13(points, scalars, splitFirstChunk) + + case 14: + p.MsmC14(points, scalars, splitFirstChunk) + + case 15: + p.MsmC15(points, scalars, splitFirstChunk) + + case 16: + p.MsmC16(points, scalars, splitFirstChunk) + + case 20: + p.MsmC20(points, scalars, splitFirstChunk) + + case 21: + p.MsmC21(points, scalars, splitFirstChunk) + + case 22: + p.MsmC22(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkE12 reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkE12(p *E12, c int, chChunks []chan E12) *E12 { + var _p E12 + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.CyclotomicSquare(&_p) + } + totalj := <-chChunks[j] + _p.Mul(&_p, &totalj) + } + + p.Set(&_p) + return p +} + +func msmProcessChunkE12(chunk uint64, + chRes chan<- E12, + buckets []E12, + c uint64, + points []E12, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].SetOne() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + var tmp E12 + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].Mul(&buckets[bits-1], &points[i]) + } else { + // sub + tmp.Conjugate(&points[i]) + buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var one, runningSum, total E12 + runningSum.SetOne() + total.SetOne() + one.SetOne() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].Equal(&one) { + runningSum.Mul(&runningSum, &buckets[k]) + } + total.Mul(&total, &runningSum) + } + + chRes <- total + +} + +func (p *E12) MsmC4(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 4 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC5(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 5 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC6(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 6 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 
64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC7(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 7 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC8(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop 
through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC9(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 9 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC10(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = 
(fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC11(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC12(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var 
buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC13(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC14(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + 
}(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC15(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC16(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + 
split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC20(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC21(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, 
points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC22(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 22 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} diff --git a/ecc/bn254/pairing_test.go b/ecc/bn254/pairing_test.go index da33256083..ff4252106b 100644 --- a/ecc/bn254/pairing_test.go +++ b/ecc/bn254/pairing_test.go @@ -19,8 +19,11 @@ package bn254 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fp" "github.com/consensys/gnark-crypto/ecc/bn254/fr" "github.com/leanovate/gopter" @@ -284,6 +287,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := gopter.NewProperties(parameters) + + genScalar := GenFr() + + // size of the multiExps + const nbSamples = 143 + + // multi exp points + var samplePoints [nbSamples]GT + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + // final scalar to use in double and add method (without mixer factor) + // n(n+1)(2n+1)/6 (sum of the squares from 1 to n) + var scalar big.Int + scalar.SetInt64(nbSamples) + scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1)) + scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1)) + scalar.Div(&scalar, new(big.Int).SetInt64(6)) + + // ensure a multiexp that's splitted has the same result as a non-splitted one.. 
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, splitted1, splitted2 GT
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePoints[:], scalars16, true)
+
+			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected, g GT
+			g.SetRandom()
+			// put into GT
+			g = FinalExponentiation(&_g)
+
+			// compute expected result with double and add
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(_g, &finalScalar)
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect to have a different multiExp than the above bucket method
+	// for small numbers of points
+	properties.Property("[GT] Multi exponentiation (<50points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of a fr.Element are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+ FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + // ------------------------------------------------------------ // benches @@ -414,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&_g) + } + + var testPoint GT + + for i := 5; i <= pow; i++ { + using := 1 << i + + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) + } +} diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go index f8a0f84bbf..e46b0086c1 100644 --- a/ecc/bw6-633/pairing_test.go +++ b/ecc/bw6-633/pairing_test.go @@ -19,8 +19,11 @@ package bw6633 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" "github.com/leanovate/gopter" @@ -43,7 +46,6 @@ func TestPairing(t *testing.T) { properties := gopter.NewProperties(parameters) genA := GenE6() - genR1 := GenFr() genR2 := GenFr() genP := GenFp() @@ -73,7 +75,6 @@ func TestPairing(t *testing.T) { var _e, ne big.Int k := new(big.Int).SetUint64(6) - e.Exp(e, k) e.ToBigIntRegular(&_e) ne.Neg(&_e) @@ -286,6 +287,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := gopter.NewProperties(parameters) + + genScalar := GenFr() + + // size of the multiExps + const nbSamples = 143 + + // multi exp points + var samplePoints [nbSamples]GT + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + // final scalar to use in double and add method (without mixer factor) + // n(n+1)(2n+1)/6 (sum of the squares from 1 to n) + var scalar big.Int + scalar.SetInt64(nbSamples) + scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1)) + scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1)) + scalar.Div(&scalar, new(big.Int).SetInt64(6)) + + // ensure a multiexp that's splitted has the same result as a non-splitted one.. 
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, splitted1, splitted2 GT
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePoints[:], scalars16, true)
+
+			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected, g GT
+			g.SetRandom()
+			// put into GT
+			g = FinalExponentiation(&_g)
+
+			// compute expected result with double and add
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(_g, &finalScalar)
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect to have a different multiExp than the above bucket method
+	// for small numbers of points
+	properties.Property("[GT] Multi exponentiation (<50points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of a fr.Element are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+ FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + // ------------------------------------------------------------ // benches @@ -391,7 +532,6 @@ func BenchmarkExpGT(b *testing.B) { e.SetRandom() k := new(big.Int).SetUint64(6) - e.Exp(e, k) var _e big.Int e.ToBigIntRegular(&_e) @@ -417,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&_g) + } + + var testPoint GT + + for i := 5; i <= pow; i++ { + using := 1 << i + + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) + } +} diff --git a/ecc/bw6-756/internal/fptower/multiexp.go b/ecc/bw6-756/internal/fptower/multiexp.go new file mode 100644 index 0000000000..9179829638 --- /dev/null +++ b/ecc/bw6-756/internal/fptower/multiexp.go @@ -0,0 +1,569 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
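Editor's note: the new fptower multiexp file below ports the curve-side Pippenger bucket method to GT, where group addition becomes field multiplication and point negation becomes conjugation. The following is a minimal, self-contained sketch of one bucket pass under that substitution; all names are illustrative (not the generated API), digits are unsigned for simplicity (the generated code uses a signed-digit variant), and Z/101Z* stands in for GT.

// Sketch only: Pippenger bucket accumulation in a multiplicative group.
package main

import (
	"fmt"
	"math/big"
)

// multiExpWindow computes prod_i bases[i]^digits[i] for c-bit digits using
// 2^c - 1 buckets: bucket[d-1] accumulates every base whose digit is d, then
// the weighted product prod_k bucket[k]^(k+1) is recovered with two running
// products, mirroring msmProcessChunk below.
func multiExpWindow(bases []*big.Int, digits []uint, c uint, p *big.Int) *big.Int {
	buckets := make([]*big.Int, (1<<c)-1)
	for i := range buckets {
		buckets[i] = big.NewInt(1)
	}
	for i, d := range digits {
		if d != 0 {
			buckets[d-1].Mul(buckets[d-1], bases[i]).Mod(buckets[d-1], p)
		}
	}
	running, total := big.NewInt(1), big.NewInt(1)
	for k := len(buckets) - 1; k >= 0; k-- {
		running.Mul(running, buckets[k]).Mod(running, p)
		total.Mul(total, running).Mod(total, p)
	}
	return total
}

func main() {
	p := big.NewInt(101)
	bases := []*big.Int{big.NewInt(2), big.NewInt(3), big.NewInt(5)}
	digits := []uint{3, 1, 3} // one 2-bit window per scalar
	// prints 71 == 2^3 * 3^1 * 5^3 mod 101
	fmt.Println(multiExpWindow(bases, digits, 2, p))
}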
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bw6-756/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process;
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated).
+	// a simplified approach is to track the small values where only the first word is set:
+	// if this number represents a significant fraction of the points, we split the first chunk's
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time.
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines;
+	// if it does, though, this will deadlock.
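+	//
+	// worked example of the signed-digit rewrite (editor's illustration, assuming c = 4, so max = 8):
+	// a window value of 13 exceeds 2^{c-1} = 8, so it becomes 13 - 16 = -3 with a carry of 1 into
+	// the next window; -3 is encoded as uint64(-(-3)-1) | msbWindow = 2 | msbWindow, and
+	// msmProcessChunk later multiplies bucket index 2 (weight 3) by the point's conjugate.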
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E6) MultiExp(points []E6, scalars []fr.Element, config ecc.MultiExpConfig) (*E6, error) {
+	// note:
+	// each of the MsmCX methods is identical, except for the c constant it declares.
+	// duplicating these methods (through template generation) allows declaring the buckets on the stack.
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics,
+	// but in practice other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunks digits.
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack;
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1).
+	// bucket accumulation here uses cyclotomic-subgroup multiplications, the GT analogue of the
+	// extended-Jacobian additions used on the curve side.
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically:
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW),
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2;
+	// we may want to do that in msmInnerE6, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
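+	// (editor's note: "sum" is multiplicative in GT; splitting the points into halves P1 and P2
+	// is sound because prod_i g_i^{k_i} = (prod_{i in P1} g_i^{k_i}) * (prod_{i in P2} g_i^{k_i}),
+	// which is why the partial results below are folded back in with p.Mul.)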
+	_p := make([]E6, nbSplits-1)
+	chDone := make(chan int, nbSplits-1)
+	for i := 0; i < nbSplits-1; i++ {
+		start := i * nbPoints
+		end := start + nbPoints
+		go func(start, end, i int) {
+			msmInnerE6(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
+			chDone <- i
+		}(start, end, i)
+	}
+
+	msmInnerE6(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk)
+	for i := 0; i < nbSplits-1; i++ {
+		done := <-chDone
+		p.Mul(p, &_p[done])
+	}
+	close(chDone)
+	return p, nil
+}
+
+func msmInnerE6(p *E6, c int, points []E6, scalars []fr.Element, splitFirstChunk bool) {
+
+	switch c {
+
+	case 4:
+		p.MsmC4(points, scalars, splitFirstChunk)
+
+	case 5:
+		p.MsmC5(points, scalars, splitFirstChunk)
+
+	case 8:
+		p.MsmC8(points, scalars, splitFirstChunk)
+
+	case 16:
+		p.MsmC16(points, scalars, splitFirstChunk)
+
+	default:
+		panic("not implemented")
+	}
+}
+
+// msmReduceChunkE6 reduces the weighted sums of the buckets into the result of the multiExp
+func msmReduceChunkE6(p *E6, c int, chChunks []chan E6) *E6 {
+	var _p E6
+	totalj := <-chChunks[len(chChunks)-1]
+	_p.Set(&totalj)
+	for j := len(chChunks) - 2; j >= 0; j-- {
+		for l := 0; l < c; l++ {
+			_p.CyclotomicSquare(&_p)
+		}
+		totalj := <-chChunks[j]
+		_p.Mul(&_p, &totalj)
+	}
+
+	p.Set(&_p)
+	return p
+}
+
+func msmProcessChunkE6(chunk uint64,
+	chRes chan<- E6,
+	buckets []E6,
+	c uint64,
+	points []E6,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].SetOne()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		var tmp E6
+		// if the msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].Mul(&buckets[bits-1], &points[i])
+		} else {
+			// sub
+			tmp.Conjugate(&points[i])
+			buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp)
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] + + var one, runningSum, total E6 + runningSum.SetOne() + total.SetOne() + one.SetOne() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].Equal(&one) { + runningSum.Mul(&runningSum, &buckets[k]) + } + total.Mul(&total, &runningSum) + } + + chRes <- total + +} + +func (p *E6) MsmC4(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 { + const ( + c = 4 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E6 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E6, 1) + } + + processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) { + var buckets [1 << (c - 1)]E6 + msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E6, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE6(p, c, chChunks[:]) +} + +func (p *E6) MsmC5(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 { + const ( + c = 5 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E6 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E6, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E6, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E6 + msmProcessChunkE6(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) { + var buckets [1 << (c - 1)]E6 + msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E6, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE6(p, c, chChunks[:]) +} + +func (p *E6) MsmC8(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit 
radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E6 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E6, 1) + } + + processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) { + var buckets [1 << (c - 1)]E6 + msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E6, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE6(p, c, chChunks[:]) +} + +func (p *E6) MsmC16(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E6 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E6, 1) + } + + processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) { + var buckets [1 << (c - 1)]E6 + msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E6, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE6(p, c, chChunks[:]) +} diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go index 5c111bbd45..bf3cdcc402 100644 --- a/ecc/bw6-756/pairing_test.go +++ b/ecc/bw6-756/pairing_test.go @@ -19,8 +19,11 @@ package bw6756 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" "github.com/leanovate/gopter" @@ -43,7 +46,6 @@ func TestPairing(t *testing.T) { properties := gopter.NewProperties(parameters) genA := GenE6() - genR1 := GenFr() genR2 := GenFr() genP := GenFp() @@ -72,7 +74,7 @@ func TestPairing(t *testing.T) { var _e, ne big.Int - k := new(big.Int).SetUint64(12) + k := new(big.Int).SetUint64(6) e.Exp(e, k) e.ToBigIntRegular(&_e) ne.Neg(&_e) @@ -285,6 +287,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + 
parameters.MinSuccessfulTests = 2
+
+	properties := gopter.NewProperties(parameters)
+
+	genScalar := GenFr()
+
+	// size of the multiExps
+	const nbSamples = 143
+
+	// multi exp points
+	var samplePoints [nbSamples]GT
+	var _g, g GT
+	_g.SetRandom()
+
+	// put into GT
+	_g = FinalExponentiation(&_g)
+
+	g.Set(&_g)
+	for i := 1; i <= nbSamples; i++ {
+		samplePoints[i-1].Set(&g)
+		g.Mul(&g, &_g)
+	}
+
+	// final scalar to use in double and add method (without mixer factor)
+	// n(n+1)(2n+1)/6 (sum of the squares from 1 to n)
+	var scalar big.Int
+	scalar.SetInt64(nbSamples)
+	scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1))
+	scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1))
+	scalar.Div(&scalar, new(big.Int).SetInt64(6))
+
+	// ensure a multiexp that's split has the same result as a non-split one
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, splitted1, splitted2 GT
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePoints[:], scalars16, true)
+
+			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected, g GT
+			g.SetRandom()
+			// put into GT
+			g = FinalExponentiation(&_g)
+
+			// compute expected result with double and add
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(_g, &finalScalar)
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect to have a different multiExp than the above bucket method
+	// for small numbers of points
+	properties.Property("[GT] Multi exponentiation (<50points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of a fr.Element are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+ FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + // ------------------------------------------------------------ // benches @@ -389,7 +531,7 @@ func BenchmarkExpGT(b *testing.B) { var e fp.Element e.SetRandom() - k := new(big.Int).SetUint64(12) + k := new(big.Int).SetUint64(6) e.Exp(e, k) var _e big.Int e.ToBigIntRegular(&_e) @@ -415,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&_g) + } + + var testPoint GT + + for i := 5; i <= pow; i++ { + using := 1 << i + + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) + } +} diff --git a/ecc/bw6-761/internal/fptower/multiexp.go b/ecc/bw6-761/internal/fptower/multiexp.go new file mode 100644 index 0000000000..97216f80f9 --- /dev/null +++ b/ecc/bw6-761/internal/fptower/multiexp.go @@ -0,0 +1,569 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
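Editor's note: a companion sketch for the other half of the pipeline, the msmReduceChunk recombination of per-window totals: c squarings between consecutive windows play the role of the doublings in a double-and-add ladder. Illustrative only, with math/big modular arithmetic standing in for GT and a hypothetical reduceWindows helper.

// Sketch only: recombining window totals, least significant window first.
package main

import (
	"fmt"
	"math/big"
)

// reduceWindows folds per-window totals t_0..t_{m-1} into
// ((t_{m-1}^(2^c)) * t_{m-2})^(2^c) * ... * t_0, squaring c times between
// windows, the multiplicative analogue of msmReduceChunkE6's CyclotomicSquare loop.
func reduceWindows(totals []*big.Int, c uint, p *big.Int) *big.Int {
	acc := new(big.Int).Set(totals[len(totals)-1])
	for j := len(totals) - 2; j >= 0; j-- {
		for l := uint(0); l < c; l++ {
			acc.Mul(acc, acc).Mod(acc, p) // squaring stands in for CyclotomicSquare
		}
		acc.Mul(acc, totals[j]).Mod(acc, p)
	}
	return acc
}

func main() {
	p := big.NewInt(101)
	// g^k with g = 2 and k = 0b1101 = 13, split into two 2-bit windows: 0b11 (high), 0b01 (low)
	hi := new(big.Int).Exp(big.NewInt(2), big.NewInt(3), p) // window total for digit 3
	lo := new(big.Int).Exp(big.NewInt(2), big.NewInt(1), p) // window total for digit 1
	fmt.Println(reduceWindows([]*big.Int{lo, hi}, 2, p))    // 11
	fmt.Println(new(big.Int).Exp(big.NewInt(2), big.NewInt(13), p)) // 2^13 mod 101 = 11
}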
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bw6-761/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process;
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated).
+	// a simplified approach is to track the small values where only the first word is set:
+	// if this number represents a significant fraction of the points, we split the first chunk's
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time.
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines;
+	// if it does, though, this will deadlock.
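+	//
+	// (editor's note: a scalar counts as "small" when it fits in one word and only its low c bits
+	// are set, e.g. 42 with c = 16; such scalars contribute work to the first window only, which is
+	// why a high proportion of them triggers the splitFirstChunk path in the msm.)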
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E6) MultiExp(points []E6, scalars []fr.Element, config ecc.MultiExpConfig) (*E6, error) {
+	// note:
+	// each of the MsmCX methods is identical, except for the c constant it declares.
+	// duplicating these methods (through template generation) allows declaring the buckets on the stack.
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics,
+	// but in practice other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunks digits.
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack;
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1).
+	// bucket accumulation here uses cyclotomic-subgroup multiplications, the GT analogue of the
+	// extended-Jacobian additions used on the curve side.
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically:
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW),
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2;
+	// we may want to do that in msmInnerE6, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
+	_p := make([]E6, nbSplits-1)
+	chDone := make(chan int, nbSplits-1)
+	for i := 0; i < nbSplits-1; i++ {
+		start := i * nbPoints
+		end := start + nbPoints
+		go func(start, end, i int) {
+			msmInnerE6(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
+			chDone <- i
+		}(start, end, i)
+	}
+
+	msmInnerE6(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk)
+	for i := 0; i < nbSplits-1; i++ {
+		done := <-chDone
+		p.Mul(p, &_p[done])
+	}
+	close(chDone)
+	return p, nil
+}
+
+func msmInnerE6(p *E6, c int, points []E6, scalars []fr.Element, splitFirstChunk bool) {
+
+	switch c {
+
+	case 4:
+		p.MsmC4(points, scalars, splitFirstChunk)
+
+	case 5:
+		p.MsmC5(points, scalars, splitFirstChunk)
+
+	case 8:
+		p.MsmC8(points, scalars, splitFirstChunk)
+
+	case 16:
+		p.MsmC16(points, scalars, splitFirstChunk)
+
+	default:
+		panic("not implemented")
+	}
+}
+
+// msmReduceChunkE6 reduces the weighted sum of the buckets into the result of the multiExp
+func msmReduceChunkE6(p *E6, c int, chChunks []chan E6) *E6 {
+	var _p E6
+	totalj := <-chChunks[len(chChunks)-1]
+	_p.Set(&totalj)
+	for j := len(chChunks) - 2; j >= 0; j-- {
+		for l := 0; l < c; l++ {
+			_p.CyclotomicSquare(&_p)
+		}
+		totalj := <-chChunks[j]
+		_p.Mul(&_p, &totalj)
+	}
+
+	p.Set(&_p)
+	return p
+}
+
+func msmProcessChunkE6(chunk uint64,
+	chRes chan<- E6,
+	buckets []E6,
+	c uint64,
+	points []E6,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].SetOne()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		var tmp E6
+		// if the msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].Mul(&buckets[bits-1], &points[i])
+		} else {
+			// sub
+			tmp.Conjugate(&points[i])
+			buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp)
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var one, runningSum, total E6
+	runningSum.SetOne()
+	total.SetOne()
+	one.SetOne()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].Equal(&one) {
+			runningSum.Mul(&runningSum, &buckets[k])
+		}
+		total.Mul(&total, &runningSum)
+	}
+
+	chRes <- total
+
+}
+
+func (p *E6) MsmC4(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 4                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
+
+func (p *E6) MsmC5(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 5                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	// c doesn't divide 256, last window is smaller we can allocate less buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	go func(j uint64, points []E6, scalars []fr.Element) {
+		var buckets [1 << (lastC - 1)]E6
+		msmProcessChunkE6(j, chChunks[j], buckets[:], c, points, scalars)
+	}(uint64(nbChunks), points, scalars)
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
+
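+// (aside on MsmC5 above, assuming a 384-bit scalar field, i.e. fr.Limbs = 6 as for the
+// BW6 curves: c = 5 leaves a last window of lastC = 384 - 5*76 = 4 bits, which is why
+// the extra chunk spawned above only needs 2^{4-1} = 8 buckets)
+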
+func (p *E6) MsmC8(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 8                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
+
+func (p *E6) MsmC16(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 16                  // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go
index 76bf81eb3a..436e6ed4d5 100644
--- a/ecc/bw6-761/pairing_test.go
+++ b/ecc/bw6-761/pairing_test.go
@@ -19,8 +19,11 @@ package bw6761
 import (
 	"fmt"
 	"math/big"
+	"math/bits"
+	"runtime"
 	"testing"
 
+	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/bw6-761/fp"
 	"github.com/consensys/gnark-crypto/ecc/bw6-761/fr"
 	"github.com/leanovate/gopter"
@@ -43,7 +46,6 @@ func TestPairing(t *testing.T) {
 	properties := gopter.NewProperties(parameters)
 
 	genA := GenE6()
-	genR1 := GenFr()
 	genR2 := GenFr()
 	genP := GenFp()
@@ -73,7 +75,6 @@ func TestPairing(t *testing.T) {
 
 			var _e, ne big.Int
 			k := new(big.Int).SetUint64(6)
-			e.Exp(e, k)
 			e.ToBigIntRegular(&_e)
 			ne.Neg(&_e)
@@ -286,6 +287,146 @@ func TestMillerLoop(t *testing.T) {
 	properties.TestingRun(t, gopter.ConsoleReporter(false))
 }
 
+func TestMultiExpGT(t *testing.T) {
+
+	parameters := gopter.DefaultTestParameters()
+	parameters.MinSuccessfulTests = 2
+
+	properties := gopter.NewProperties(parameters)
+
+	genScalar := GenFr()
+
+	// size of the multiExps
+	const nbSamples = 143
+
+	// multi exp points
+	var samplePoints [nbSamples]GT
+	var _g, g GT
+	_g.SetRandom()
+
+	// put into GT
+	_g = FinalExponentiation(&_g)
+
+	g.Set(&_g)
+	for i := 1; i <= nbSamples; i++ {
+		samplePoints[i-1].Set(&g)
+		g.Mul(&g, &_g)
+	}
+
+	// final scalar to use in double and add method (without mixer factor)
+	// n(n+1)(2n+1)/6 (sum of the squares from 1 to n)
+	var scalar big.Int
+	scalar.SetInt64(nbSamples)
+	scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1))
+	scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1))
+	scalar.Div(&scalar, new(big.Int).SetInt64(6))
+
+	// ensure a multiexp that's split has the same result as a non-split one
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, splitted1, splitted2 GT
+
+			// mixer ensures that all the words of a fpElement are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePoints[:], scalars16, true)
+
+			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected GT
+
+			// compute expected result with double and add
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(_g, &finalScalar)
+
+			// mixer ensures that all the words of a fpElement are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
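+	// (why the sum of squares above: samplePoints[i-1] = _g^{i} and sampleScalars[i-1] = i*mixer,
+	// so the MSM computes _g^{(1*1 + 2*2 + ... + n*n)*mixer} = _g^{n(n+1)(2n+1)/6 * mixer},
+	// which is exactly expected = ExpGLV(_g, scalar*mixer))
+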
+	// note: this test is here as we expect to have a different multiExp from the above bucket method
+	// for a small number of points
+	properties.Property("[GT] Multi exponentiation (<50 points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of a fpElement are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+				samplePoints[i-1].Set(&g)
+				g.Mul(&g, &_g)
+			}
+
+			var op1MultiExp GT
+			op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{})
+
+			var finalBigScalar fr.Element
+			var finalBigScalarBi big.Int
+			var op1ScalarMul GT
+			finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer)
+			finalBigScalar.ToBigIntRegular(&finalBigScalarBi)
+			op1ScalarMul.ExpGLV(_g, &finalBigScalarBi)
+
+			return op1ScalarMul.Equal(&op1MultiExp)
+		},
+		genScalar,
+	))
+
+	properties.TestingRun(t, gopter.ConsoleReporter(false))
+}
+
 // ------------------------------------------------------------
 // benches
@@ -391,7 +532,6 @@ func BenchmarkExpGT(b *testing.B) {
 	e.SetRandom()
 
 	k := new(big.Int).SetUint64(6)
-	e.Exp(e, k)
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
@@ -417,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) {
 		}
 	})
 }
+
+func BenchmarkMultiExpGT(b *testing.B) {
+	// ensure every word of the scalars is filled
+	var mixer fr.Element
+	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
+
+	const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64-bit arch, 12 on 32-bit
+	const nbSamples = 1 << pow
+
+	var samplePoints [nbSamples]GT
+	var sampleScalars [nbSamples]fr.Element
+	var _g GT
+	_g.SetRandom()
+
+	for i := 1; i <= nbSamples; i++ {
+		sampleScalars[i-1].SetUint64(uint64(i)).
+			Mul(&sampleScalars[i-1], &mixer).
+			FromMont()
+		samplePoints[i-1].Set(&_g)
+	}
+
+	var testPoint GT
+
+	for i := 5; i <= pow; i++ {
+		using := 1 << i
+
+		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
+	}
+}

From 8ad6473f4dad12dae00298de3994a234989f6122 Mon Sep 17 00:00:00 2001
From: Youssef El Housni
Date: Mon, 20 Jun 2022 18:14:55 +0100
Subject: [PATCH 15/16] refactor: code generation of MultiExp test in pairing_test

---
 ecc/bls12-378/pairing_test.go                 |   5 +-
 ecc/bls24-315/pairing_test.go                 |   5 +-
 ecc/bls24-317/pairing_test.go                 |   3 +-
 ecc/bw6-633/pairing_test.go                   |   3 +
 ecc/bw6-756/pairing_test.go                   |   4 +-
 ecc/bw6-761/pairing_test.go                   |   3 +
 .../pairing/template/tests/pairing.go.tmpl    | 181 +++++++++++++++++-
 7 files changed, 197 insertions(+), 7 deletions(-)

diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go
index 0bae6e1e41..90bc6ab07f 100644
--- a/ecc/bls12-378/pairing_test.go
+++ b/ecc/bls12-378/pairing_test.go
@@ -286,6 +286,7 @@ func TestMillerLoop(t *testing.T) {
 	properties.TestingRun(t, gopter.ConsoleReporter(false))
 }
 
+
 func TestMultiExpGT(t *testing.T) {
 
 	parameters := gopter.DefaultTestParameters()
@@ -361,7 +362,7 @@ func TestMultiExpGT(t *testing.T) {
 			// compute expected result with double and add
 			var finalScalar, mixerBigInt big.Int
 			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
-			expected.ExpGLV(&_g, &finalScalar)
+			expected.ExpGLV(_g, &finalScalar)
 
 			// mixer ensures that all the words of a fpElement are set
 			var sampleScalars [nbSamples]fr.Element
@@ -416,7 +417,7 @@ func TestMultiExpGT(t *testing.T) {
 			var op1ScalarMul GT
 			finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer)
 			finalBigScalar.ToBigIntRegular(&finalBigScalarBi)
-			op1ScalarMul.ExpGLV(&_g, &finalBigScalarBi)
+			op1ScalarMul.ExpGLV(_g, &finalBigScalarBi)
 
 			return op1ScalarMul.Equal(&op1MultiExp)
 		},
diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go
index 1663205050..d14bb6e4c7 100644
--- a/ecc/bls24-315/pairing_test.go
+++ b/ecc/bls24-315/pairing_test.go
@@ -46,6 +46,7 @@ func TestPairing(t *testing.T) {
 	properties := gopter.NewProperties(parameters)
 
 	genA := GenE24()
+	genR1 := GenFr()
 	genR2 := GenFr()
 	genP := GenFp()
@@ -75,6 +76,7 @@ func TestPairing(t *testing.T) {
 
 			var _e, ne big.Int
 			k := new(big.Int).SetUint64(24)
+			e.Exp(e, k)
 			e.ToBigIntRegular(&_e)
 			ne.Neg(&_e)
@@ -531,7 +533,8 @@ func BenchmarkExpGT(b *testing.B) {
 	var e fp.Element
 	e.SetRandom()
 
-	k := new(big.Int).SetUint64(12)
+	k := new(big.Int).SetUint64(24)
+	e.Exp(e, k)
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
diff --git a/ecc/bls24-317/pairing_test.go b/ecc/bls24-317/pairing_test.go
index 55b48fe256..3889f1e6a2 100644
--- a/ecc/bls24-317/pairing_test.go
+++ b/ecc/bls24-317/pairing_test.go
@@ -46,6 +46,7 @@ func TestPairing(t *testing.T) {
 	properties := gopter.NewProperties(parameters)
 
 	genA := GenE24()
+	genR1 := GenFr()
 	genR2 := GenFr()
 	genP := GenFp()
@@ -531,7 +532,7 @@ func BenchmarkExpGT(b *testing.B) {
 	var e fp.Element
 	e.SetRandom()
 
-	k := new(big.Int).SetUint64(24)
+	k := new(big.Int).SetUint64(12)
 	e.Exp(e, k)
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go
index e46b0086c1..6c39eb3be6 100644
--- a/ecc/bw6-633/pairing_test.go
+++ b/ecc/bw6-633/pairing_test.go
@@ -46,6 +46,7 @@ func TestPairing(t *testing.T) {
 	properties := gopter.NewProperties(parameters)
 
 	genA := GenE6()
+	genR1 := GenFr()
 	genR2 := GenFr()
 	genP := GenFp()
@@ -75,6 +76,7 @@ func TestPairing(t *testing.T) {
 
 			var _e, ne big.Int
 			k := new(big.Int).SetUint64(6)
+			e.Exp(e, k)
 			e.ToBigIntRegular(&_e)
 			ne.Neg(&_e)
@@ -532,6 +534,7 @@ func BenchmarkExpGT(b *testing.B) {
 	e.SetRandom()
 
 	k := new(big.Int).SetUint64(6)
+	e.Exp(e, k)
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go
index bf3cdcc402..ae9dd418b3 100644
--- a/ecc/bw6-756/pairing_test.go
+++ b/ecc/bw6-756/pairing_test.go
@@ -46,6 +46,7 @@ func TestPairing(t *testing.T) {
 	properties := gopter.NewProperties(parameters)
 
 	genA := GenE6()
+	genR1 := GenFr()
 	genR2 := GenFr()
 	genP := GenFp()
@@ -74,7 +75,7 @@ func TestPairing(t *testing.T) {
 
 			var _e, ne big.Int
-			k := new(big.Int).SetUint64(6)
+			k := new(big.Int).SetUint64(12)
 			e.Exp(e, k)
 			e.ToBigIntRegular(&_e)
 			ne.Neg(&_e)
@@ -532,6 +533,7 @@ func BenchmarkExpGT(b *testing.B) {
 	e.SetRandom()
 
 	k := new(big.Int).SetUint64(6)
+	e.Exp(e, k)
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go
index 436e6ed4d5..6c0219ba3f 100644
--- a/ecc/bw6-761/pairing_test.go
+++ b/ecc/bw6-761/pairing_test.go
@@ -46,6 +46,7 @@ func TestPairing(t *testing.T) {
 	properties := gopter.NewProperties(parameters)
 
 	genA := GenE6()
+	genR1 := GenFr()
 	genR2 := GenFr()
 	genP := GenFp()
@@ -75,6 +76,7 @@ func TestPairing(t *testing.T) {
 
 			var _e, ne big.Int
 			k := new(big.Int).SetUint64(6)
+			e.Exp(e, k)
 			e.ToBigIntRegular(&_e)
 			ne.Neg(&_e)
@@ -532,6 +534,7 @@ func BenchmarkExpGT(b *testing.B) {
 	e.SetRandom()
 
 	k := new(big.Int).SetUint64(6)
+	e.Exp(e, k)
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
diff --git a/internal/generator/pairing/template/tests/pairing.go.tmpl b/internal/generator/pairing/template/tests/pairing.go.tmpl
index a8209b7ac2..3a020e2889 100644
--- a/internal/generator/pairing/template/tests/pairing.go.tmpl
+++ b/internal/generator/pairing/template/tests/pairing.go.tmpl
@@ -1,8 +1,11 @@
 import (
 	"fmt"
 	"math/big"
+	"math/bits"
+	"runtime"
 	"testing"
 
+	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr"
 	"github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp"
"github.com/leanovate/gopter" @@ -286,6 +289,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := gopter.NewProperties(parameters) + + genScalar := GenFr() + + // size of the multiExps + const nbSamples = 143 + + // multi exp points + var samplePoints [nbSamples]GT + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + // final scalar to use in double and add method (without mixer factor) + // n(n+1)(2n+1)/6 (sum of the squares from 1 to n) + var scalar big.Int + scalar.SetInt64(nbSamples) + scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1)) + scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1)) + scalar.Div(&scalar, new(big.Int).SetInt64(6)) + + // ensure a multiexp that's splitted has the same result as a non-splitted one.. + properties.Property("[GT] Multi exponentation (c=16) should be consistant with splitted multiexp", prop.ForAll( + func(mixer fr.Element) bool { + var samplePointsLarge [nbSamples * 13]GT + for i := 0; i < 13; i++ { + copy(samplePointsLarge[i*nbSamples:], samplePoints[:]) + } + + var r16, splitted1, splitted2 GT + + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples * 13]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } + + scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) + r16.MsmC16(samplePoints[:], scalars16, true) + + splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) + splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) + return r16.Equal(&splitted1) && r16.Equal(&splitted2) + }, + genScalar, + )) + + // we test only c = 5 and c = 16 + properties.Property("[GT] Multi exponentation (c=5, c=16) should be consistant with sum of square", prop.ForAll( + func(mixer fr.Element) bool { + + var expected, g GT + g.SetRandom() + // put into GT + g = FinalExponentiation(&_g) + + // compute expected result with double and add + var finalScalar, mixerBigInt big.Int + finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt)) + expected.ExpGLV(_g, &finalScalar) + + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU()) + scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) + + var r5, r16 GT + r5.MsmC5(samplePoints[:], scalars5, false) + r16.MsmC16(samplePoints[:], scalars16, true) + return (r5.Equal(&expected) && r16.Equal(&expected)) + }, + genScalar, + )) + + // note : this test is here as we expect to have a different multiExp than the above bucket method + // for small number of points + properties.Property("[GT] Multi exponentation (<50points) should be consistant with sum of square", prop.ForAll( + func(mixer fr.Element) bool { + + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + + // mixer ensures that all the words of a fpElement are set + samplePoints := make([]GT, 30) + sampleScalars := make([]fr.Element, 30) + + for i := 1; i <= 30; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + // ------------------------------------------------------------ // benches @@ -390,9 +533,9 @@ func BenchmarkExpGT(b *testing.B) { var e fp.Element e.SetRandom() - {{if or (eq .Name "bw6-761") (eq .Name "bw6-633")}} + {{if or (eq .Name "bw6-761") (eq .Name "bw6-633") (eq .Name "bw6-756")}} k := new(big.Int).SetUint64(6) - {{else if eq .Name "bls24-315"}} + {{else if or (eq .Name "bls24-315") (eq .Name "bls24-315")}} k := new(big.Int).SetUint64(24) {{ else }} k := new(big.Int).SetUint64(12) @@ -422,3 +565,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
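+// (note on pow in the benchmark below: on a 64-bit arch bits.UintSize = 64, so
+// pow = 32 - 8 = 24 and the runs sweep power-of-two sizes from 2^5 up to 2^24 points)
+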
+func BenchmarkMultiExpGT(b *testing.B) {
+	// ensure every word of the scalars is filled
+	var mixer fr.Element
+	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
+
+	const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64-bit arch, 12 on 32-bit
+	const nbSamples = 1 << pow
+
+	var samplePoints [nbSamples]GT
+	var sampleScalars [nbSamples]fr.Element
+	var _g GT
+	_g.SetRandom()
+
+	for i := 1; i <= nbSamples; i++ {
+		sampleScalars[i-1].SetUint64(uint64(i)).
+			Mul(&sampleScalars[i-1], &mixer).
+			FromMont()
+		samplePoints[i-1].Set(&_g)
+	}
+
+	var testPoint GT
+
+	for i := 5; i <= pow; i++ {
+		using := 1 << i
+
+		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
+	}
+}

From d890c8d62588cd374e3f77f8cc01ca8d43d92b51 Mon Sep 17 00:00:00 2001
From: Youssef El Housni
Date: Tue, 21 Jun 2022 10:00:52 +0100
Subject: [PATCH 16/16] fix(bw6-633/GT): GT-MSM with window size 5

---
 ecc/bw6-633/internal/fptower/multiexp.go | 562 +++++++++++++++++++++++
 ecc/bw6-756/internal/fptower/multiexp.go |   2 +-
 ecc/bw6-761/internal/fptower/multiexp.go |   2 +-
 3 files changed, 564 insertions(+), 2 deletions(-)
 create mode 100644 ecc/bw6-633/internal/fptower/multiexp.go

diff --git a/ecc/bw6-633/internal/fptower/multiexp.go b/ecc/bw6-633/internal/fptower/multiexp.go
new file mode 100644
index 0000000000..694a74dc47
--- /dev/null
+++ b/ecc/bw6-633/internal/fptower/multiexp.go
@@ -0,0 +1,562 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bw6-633/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
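+// (example with c = 8, where the threshold is 2^{7} = 128: a window value of 200 is
+// re-encoded as the digit 200 - 256 = -56 plus a carry of 1 into the next window,
+// since 200 = 256 - 56)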
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and a possible carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
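+				// (for reference: a negative digit -d is stored a few lines below as
+				// (d-1) | msbWindow, so -1 encodes as msbWindow and -2^{c-1} encodes
+				// as 2^{c}-1, still fitting in a c-bit window)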
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E6) MultiExp(points []E6, scalars []fr.Element, config ecc.MultiExpConfig) (*E6, error) {
+	// note:
+	// each of the MsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
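+		// (worked instance of the cost model, taking fr.Limbs = 5, i.e. 320 scalar bits as
+		// on bw6-633: for nbPoints = 2^16, c = 8 costs 320/8 * (2^16 + 2^8) ≈ 2.63M group
+		// operations while c = 16 costs 320/16 * (2^16 + 2^16) ≈ 2.62M, so the model alone
+		// barely separates them and empirical tuning matters)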
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerE6, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
+	_p := make([]E6, nbSplits-1)
+	chDone := make(chan int, nbSplits-1)
+	for i := 0; i < nbSplits-1; i++ {
+		start := i * nbPoints
+		end := start + nbPoints
+		go func(start, end, i int) {
+			msmInnerE6(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
+			chDone <- i
+		}(start, end, i)
+	}
+
+	msmInnerE6(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk)
+	for i := 0; i < nbSplits-1; i++ {
+		done := <-chDone
+		p.Mul(p, &_p[done])
+	}
+	close(chDone)
+	return p, nil
+}
+
+func msmInnerE6(p *E6, c int, points []E6, scalars []fr.Element, splitFirstChunk bool) {
+
+	switch c {
+
+	case 4:
+		p.MsmC4(points, scalars, splitFirstChunk)
+
+	case 5:
+		p.MsmC5(points, scalars, splitFirstChunk)
+
+	case 8:
+		p.MsmC8(points, scalars, splitFirstChunk)
+
+	case 16:
+		p.MsmC16(points, scalars, splitFirstChunk)
+
+	default:
+		panic("not implemented")
+	}
+}
+
+// msmReduceChunkE6 reduces the weighted sum of the buckets into the result of the multiExp
+func msmReduceChunkE6(p *E6, c int, chChunks []chan E6) *E6 {
+	var _p E6
+	totalj := <-chChunks[len(chChunks)-1]
+	_p.Set(&totalj)
+	for j := len(chChunks) - 2; j >= 0; j-- {
+		for l := 0; l < c; l++ {
+			_p.CyclotomicSquare(&_p)
+		}
+		totalj := <-chChunks[j]
+		_p.Mul(&_p, &totalj)
+	}
+
+	p.Set(&_p)
+	return p
+}
+
+func msmProcessChunkE6(chunk uint64,
+	chRes chan<- E6,
+	buckets []E6,
+	c uint64,
+	points []E6,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].SetOne()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
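+	// (illustration: with c = 5 and chunk = 13, jc = 65, so s.index = 1 and s.shift = 1;
+	// the digit is then read from scalar bits 65..69, i.e. bits 1..5 of the second word)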
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		var tmp E6
+		// if the msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].Mul(&buckets[bits-1], &points[i])
+		} else {
+			// sub
+			tmp.Conjugate(&points[i])
+			buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp)
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var one, runningSum, total E6
+	runningSum.SetOne()
+	total.SetOne()
+	one.SetOne()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].Equal(&one) {
+			runningSum.Mul(&runningSum, &buckets[k])
+		}
+		total.Mul(&total, &runningSum)
+	}
+
+	chRes <- total
+
+}
+
+func (p *E6) MsmC4(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 4                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
+
+func (p *E6) MsmC5(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 5                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
+
+func (p *E6) MsmC8(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 8                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
+
+func (p *E6) MsmC16(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 16                  // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
diff --git a/ecc/bw6-756/internal/fptower/multiexp.go b/ecc/bw6-756/internal/fptower/multiexp.go
index 9179829638..cc82237c8a 100644
--- a/ecc/bw6-756/internal/fptower/multiexp.go
+++ b/ecc/bw6-756/internal/fptower/multiexp.go
@@ -443,7 +443,7 @@ func (p *E6) MsmC5(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6
 		chChunks[i] = make(chan E6, 1)
 	}
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
+	// c doesn't divide 384, last window is smaller we can allocate less buckets
 	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
 	go func(j uint64, points []E6, scalars []fr.Element) {
 		var buckets [1 << (lastC - 1)]E6
diff --git a/ecc/bw6-761/internal/fptower/multiexp.go b/ecc/bw6-761/internal/fptower/multiexp.go
index 97216f80f9..a6996edbf8 100644
--- a/ecc/bw6-761/internal/fptower/multiexp.go
+++ b/ecc/bw6-761/internal/fptower/multiexp.go
@@ -443,7 +443,7 @@ func (p *E6) MsmC5(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6
 		chChunks[i] = make(chan E6, 1)
 	}
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
+	// c doesn't divide 384, last window is smaller we can allocate less buckets
 	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
 	go func(j uint64, points []E6, scalars []fr.Element) {
 		var buckets [1 << (lastC - 1)]E6