From d90c332cf8dbe4c209aaedc350e830ac6a6bfb9f Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Tue, 4 Jan 2022 17:16:11 +0100 Subject: [PATCH 01/16] feat(GT): exponentiation in the cyclotomic subgroups of E6, E12, E24 --- ecc/bls12-377/internal/fptower/e12.go | 21 ++++++++++++++++ ecc/bls12-377/internal/fptower/e12_test.go | 25 +++++++++++++++++++ ecc/bls12-381/internal/fptower/e12.go | 21 ++++++++++++++++ ecc/bls12-381/internal/fptower/e12_test.go | 25 +++++++++++++++++++ ecc/bls24-315/internal/fptower/e24.go | 21 ++++++++++++++++ ecc/bls24-315/internal/fptower/e24_test.go | 25 +++++++++++++++++++ ecc/bn254/internal/fptower/e12.go | 21 ++++++++++++++++ ecc/bn254/internal/fptower/e12_test.go | 25 +++++++++++++++++++ ecc/bw6-633/internal/fptower/e6.go | 21 ++++++++++++++++ ecc/bw6-633/internal/fptower/e6_test.go | 25 +++++++++++++++++++ ecc/bw6-761/internal/fptower/e6.go | 21 ++++++++++++++++ ecc/bw6-761/internal/fptower/e6_test.go | 25 +++++++++++++++++++ .../template/fq12over6over2/fq12.go.tmpl | 21 ++++++++++++++++ .../fq12over6over2/tests/fq12.go.tmpl | 25 +++++++++++++++++++ 14 files changed, 322 insertions(+) diff --git a/ecc/bls12-377/internal/fptower/e12.go b/ecc/bls12-377/internal/fptower/e12.go index cbc2606db8..06d129ad98 100644 --- a/ecc/bls12-377/internal/fptower/e12.go +++ b/ecc/bls12-377/internal/fptower/e12.go @@ -19,6 +19,7 @@ package fptower import ( "encoding/binary" "errors" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" "math/big" ) @@ -387,6 +388,26 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the cyclotomic subgroup +func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { + var res, xInv E12 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/ecc/bls12-377/internal/fptower/e12_test.go b/ecc/bls12-377/internal/fptower/e12_test.go index a7cfec865a..c3ec4804f9 100644 --- a/ecc/bls12-377/internal/fptower/e12_test.go +++ b/ecc/bls12-377/internal/fptower/e12_test.go @@ -17,6 +17,7 @@ package fptower import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" @@ -192,6 +193,7 @@ func TestE12Ops(t *testing.T) { genA := GenE12() genB := GenE12() + genExp := GenFp() properties.Property("[BLS12-377] sub & add should leave an element invariant", prop.ForAll( func(a, b *E12) bool { @@ -349,6 +351,29 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[BLS12-377] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E12, e fp.Element) bool { + var b, c, d E12 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BLS12-377] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/ecc/bls12-381/internal/fptower/e12.go b/ecc/bls12-381/internal/fptower/e12.go index bb92cae533..8935d7b656 100644 
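A note on the tests above: an arbitrary a in E12 is first projected into the cyclotomic subgroup by applying the "easy part" of the final exponentiation. Conjugation over Fp12/Fp6 is the p^6-power Frobenius, so b.Conjugate(a); a.Inverse(a); b.Mul(&b, a) computes a^(p^6-1), and a.FrobeniusSquare(&b).Mul(a, &b) raises that to p^2+1. Since p^12 - 1 = (p^6 - 1)(p^2 + 1)(p^4 - p^2 + 1), the result has order dividing Phi_12(p) = p^4 - p^2 + 1, i.e. it lies in the cyclotomic subgroup where CyclotomicSquare is valid. The E24 and E6 tests use the analogous factorizations, p^24 - 1 = (p^12 - 1)(p^4 + 1) Phi_24(p) via FrobeniusQuad and p^6 - 1 = (p^3 - 1)(p + 1) Phi_6(p) via Frobenius. A minimal math/big check of the E12 factorization (the prime below is an arbitrary stand-in, not a curve parameter):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	// arbitrary stand-in prime; the identity below holds for any p
	p := big.NewInt(1000003)
	one := big.NewInt(1)

	p2 := new(big.Int).Mul(p, p)
	p4 := new(big.Int).Mul(p2, p2)
	p6 := new(big.Int).Mul(p4, p2)
	p12 := new(big.Int).Mul(p6, p6)

	lhs := new(big.Int).Sub(p12, one) // p^12 - 1

	f1 := new(big.Int).Sub(p6, one) // p^6 - 1: Conjugate times Inverse
	f2 := new(big.Int).Add(p2, one) // p^2 + 1: FrobeniusSquare times identity
	f3 := new(big.Int).Sub(p4, p2)
	f3.Add(f3, one) // Phi_12(p) = p^4 - p^2 + 1

	rhs := new(big.Int).Mul(f1, f2)
	rhs.Mul(rhs, f3)

	fmt.Println(lhs.Cmp(rhs) == 0) // true: (p^6-1)(p^2+1) maps into the Phi_12 subgroup
}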
--- a/ecc/bls12-381/internal/fptower/e12.go +++ b/ecc/bls12-381/internal/fptower/e12.go @@ -19,6 +19,7 @@ package fptower import ( "encoding/binary" "errors" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" "math/big" ) @@ -387,6 +388,26 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the cyclotomic subgroup +func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { + var res, xInv E12 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/ecc/bls12-381/internal/fptower/e12_test.go b/ecc/bls12-381/internal/fptower/e12_test.go index 6901a716e0..4011baaab2 100644 --- a/ecc/bls12-381/internal/fptower/e12_test.go +++ b/ecc/bls12-381/internal/fptower/e12_test.go @@ -17,6 +17,7 @@ package fptower import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" @@ -192,6 +193,7 @@ func TestE12Ops(t *testing.T) { genA := GenE12() genB := GenE12() + genExp := GenFp() properties.Property("[BLS12-381] sub & add should leave an element invariant", prop.ForAll( func(a, b *E12) bool { @@ -349,6 +351,29 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[BLS12-381] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E12, e fp.Element) bool { + var b, c, d E12 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BLS12-381] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/ecc/bls24-315/internal/fptower/e24.go b/ecc/bls24-315/internal/fptower/e24.go index c76a520102..ed8444f448 100644 --- a/ecc/bls24-315/internal/fptower/e24.go +++ b/ecc/bls24-315/internal/fptower/e24.go @@ -18,6 +18,7 @@ package fptower import ( "errors" + "github.com/consensys/gnark-crypto/ecc" "math/big" ) @@ -385,6 +386,26 @@ func (z *E24) Exp(x *E24, e big.Int) *E24 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the cyclotomic subgroup +func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { + var res, xInv E24 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E24) InverseUnitary(x *E24) *E24 { return z.Conjugate(x) diff --git a/ecc/bls24-315/internal/fptower/e24_test.go b/ecc/bls24-315/internal/fptower/e24_test.go index 32edf7e915..a67224737e 100644 --- a/ecc/bls24-315/internal/fptower/e24_test.go +++ b/ecc/bls24-315/internal/fptower/e24_test.go @@ -17,6 +17,7 @@ package fptower import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" @@ -192,6 +193,7 @@ func TestE24Ops(t *testing.T) { genA := 
GenE24() genB := GenE24() + genExp := GenFp() properties.Property("[BLS24-315] sub & add should leave an element invariant", prop.ForAll( func(a, b *E24) bool { @@ -371,6 +373,29 @@ func TestE24Ops(t *testing.T) { genA, )) + properties.Property("[BLS24-315] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E24, e fp.Element) bool { + var b, c, d E24 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusQuad(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(24) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BLS24-315] Frobenius of x in E24 should be equal to x^q", prop.ForAll( func(a *E24) bool { var b, c E24 diff --git a/ecc/bn254/internal/fptower/e12.go b/ecc/bn254/internal/fptower/e12.go index 3f8c763fb8..e29f7d0a93 100644 --- a/ecc/bn254/internal/fptower/e12.go +++ b/ecc/bn254/internal/fptower/e12.go @@ -19,6 +19,7 @@ package fptower import ( "encoding/binary" "errors" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fp" "math/big" ) @@ -387,6 +388,26 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the cyclotomic subgroup +func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { + var res, xInv E12 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/ecc/bn254/internal/fptower/e12_test.go b/ecc/bn254/internal/fptower/e12_test.go index 6c56c86805..942720cbe3 100644 --- a/ecc/bn254/internal/fptower/e12_test.go +++ b/ecc/bn254/internal/fptower/e12_test.go @@ -17,6 +17,7 @@ package fptower import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/fp" @@ -192,6 +193,7 @@ func TestE12Ops(t *testing.T) { genA := GenE12() genB := GenE12() + genExp := GenFp() properties.Property("[BN254] sub & add should leave an element invariant", prop.ForAll( func(a, b *E12) bool { @@ -349,6 +351,29 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[BN254] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E12, e fp.Element) bool { + var b, c, d E12 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BN254] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/ecc/bw6-633/internal/fptower/e6.go b/ecc/bw6-633/internal/fptower/e6.go index 19e6c860e7..c938b91f3f 100644 --- a/ecc/bw6-633/internal/fptower/e6.go +++ b/ecc/bw6-633/internal/fptower/e6.go @@ -20,6 +20,7 @@ import ( "errors" "math/big" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" ) @@ -330,6 +331,26 @@ func (z *E6) Exp(x *E6, e big.Int) *E6 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the 
cyclotomic subgroup +func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { + var res, xInv E6 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E6) InverseUnitary(x *E6) *E6 { return z.Conjugate(x) diff --git a/ecc/bw6-633/internal/fptower/e6_test.go b/ecc/bw6-633/internal/fptower/e6_test.go index 49e6cb7bf0..51c5cbce43 100644 --- a/ecc/bw6-633/internal/fptower/e6_test.go +++ b/ecc/bw6-633/internal/fptower/e6_test.go @@ -17,6 +17,7 @@ package fptower import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -172,6 +173,7 @@ func TestE6Ops(t *testing.T) { genA := GenE6() genB := GenE6() + genExp := GenFp() properties.Property("[BW6-633] sub & add should leave an element invariant", prop.ForAll( func(a, b *E6) bool { @@ -277,6 +279,29 @@ func TestE6Ops(t *testing.T) { genA, )) + properties.Property("[BW6-633] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E6, e fp.Element) bool { + var b, c, d E6 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.Frobenius(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(6) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BW6-633] Frobenius of x in E6 should be equal to x^q", prop.ForAll( func(a *E6) bool { var b, c E6 diff --git a/ecc/bw6-761/internal/fptower/e6.go b/ecc/bw6-761/internal/fptower/e6.go index b8027d9bcf..6261475843 100644 --- a/ecc/bw6-761/internal/fptower/e6.go +++ b/ecc/bw6-761/internal/fptower/e6.go @@ -20,6 +20,7 @@ import ( "errors" "math/big" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" ) @@ -329,6 +330,26 @@ func (z *E6) Exp(x *E6, e big.Int) *E6 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the cyclotomic subgroup +func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { + var res, xInv E6 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E6) InverseUnitary(x *E6) *E6 { return z.Conjugate(x) diff --git a/ecc/bw6-761/internal/fptower/e6_test.go b/ecc/bw6-761/internal/fptower/e6_test.go index 44aeb66186..0e693628cc 100644 --- a/ecc/bw6-761/internal/fptower/e6_test.go +++ b/ecc/bw6-761/internal/fptower/e6_test.go @@ -17,6 +17,7 @@ package fptower import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -172,6 +173,7 @@ func TestE6Ops(t *testing.T) { genA := GenE6() genB := GenE6() + genExp := GenFp() properties.Property("[BW6-761] sub & add should leave an element invariant", prop.ForAll( func(a, b *E6) bool { @@ -277,6 +279,29 @@ func TestE6Ops(t *testing.T) { genA, )) + properties.Property("[BW6-761] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E6, e fp.Element) bool { + var b, c, d E6 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + 
b.Mul(&b, a) + a.Frobenius(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(6) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BW6-761] Frobenius of x in E6 should be equal to x^q", prop.ForAll( func(a *E6) bool { var b, c E6 diff --git a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl index 07fd9fad1b..1982772154 100644 --- a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl @@ -2,6 +2,7 @@ import ( "math/big" "encoding/binary" "errors" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp" ) @@ -370,6 +371,26 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } +// CyclotomicExp sets z=x**e and returns it +// x must be in the cyclotomic subgroup +func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { + var res, xInv E12 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) + } + } + z.Set(&res) + return z +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl index 126132c0d0..abd986863f 100644 --- a/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl @@ -1,4 +1,5 @@ import ( + "math/big" "testing" "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp" @@ -174,6 +175,7 @@ func TestE12Ops(t *testing.T) { genA := GenE12() genB := GenE12() + genExp := GenFp() properties.Property("[{{ toUpper .Name }}] sub & add should leave an element invariant", prop.ForAll( func(a, b *E12) bool { @@ -331,6 +333,29 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[{{ toUpper .Name }}] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E12, e fp.Element) bool { + var b, c, d E12 + // put in the cyclo subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + + var _e big.Int + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[{{ toUpper .Name }}] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 From feb3e16e573a3cb07669f753f74f1d0f800dc8c9 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Tue, 4 Jan 2022 18:29:59 +0100 Subject: [PATCH 02/16] perf(E12, E24, E6): exponentiation using 2-bit windowing method --- ecc/bls12-377/internal/fptower/e12.go | 22 ++++++++++++++----- ecc/bls12-381/internal/fptower/e12.go | 22 ++++++++++++++----- ecc/bls24-315/internal/fptower/e24.go | 22 ++++++++++++++----- ecc/bn254/internal/fptower/e12.go | 22 ++++++++++++++----- ecc/bw6-633/internal/fptower/e6.go | 22 ++++++++++++++----- ecc/bw6-761/internal/fptower/e6.go | 22 ++++++++++++++----- .../template/fq12over6over2/fq12.go.tmpl | 22 ++++++++++++++----- 7 files changed, 112 insertions(+), 42 deletions(-) diff --git 
a/ecc/bls12-377/internal/fptower/e12.go b/ecc/bls12-377/internal/fptower/e12.go index 06d129ad98..6939f52eb8 100644 --- a/ecc/bls12-377/internal/fptower/e12.go +++ b/ecc/bls12-377/internal/fptower/e12.go @@ -369,22 +369,32 @@ func (z *E12) Inverse(x *E12) *E12 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E12) Exp(x *E12, e big.Int) *E12 { + var res E12 + var ops [3]E12 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/ecc/bls12-381/internal/fptower/e12.go b/ecc/bls12-381/internal/fptower/e12.go index 8935d7b656..572c97e5bd 100644 --- a/ecc/bls12-381/internal/fptower/e12.go +++ b/ecc/bls12-381/internal/fptower/e12.go @@ -369,22 +369,32 @@ func (z *E12) Inverse(x *E12) *E12 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E12) Exp(x *E12, e big.Int) *E12 { + var res E12 + var ops [3]E12 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/ecc/bls24-315/internal/fptower/e24.go b/ecc/bls24-315/internal/fptower/e24.go index ed8444f448..4863295236 100644 --- a/ecc/bls24-315/internal/fptower/e24.go +++ b/ecc/bls24-315/internal/fptower/e24.go @@ -367,22 +367,32 @@ func (z *E24) Inverse(x *E24) *E24 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E24) Exp(x *E24, e big.Int) *E24 { + var res E24 + var ops [3]E24 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/ecc/bn254/internal/fptower/e12.go b/ecc/bn254/internal/fptower/e12.go index e29f7d0a93..15e808a9b1 100644 --- a/ecc/bn254/internal/fptower/e12.go +++ b/ecc/bn254/internal/fptower/e12.go @@ -369,22 +369,32 @@ func (z *E12) Inverse(x *E12) *E12 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E12) Exp(x *E12, e big.Int) *E12 { + var res E12 + var ops [3]E12 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } 
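The 2-bit window in the new Exp trades two extra precomputed powers (ops holds x, x^2, x^3) for roughly half the multiplications of the bit-at-a-time loop it replaces. A quick way to convince oneself that the byte masking and shifting are right is to run the identical loop over plain modular integers and compare against math/big's own Exp; everything below (modulus, base, exponent) is an arbitrary toy choice, not a curve value:

package main

import (
	"fmt"
	"math/big"
)

// windowedExp mirrors the patch's 2-bit windowed loop,
// instantiated over Z/pZ instead of the extension tower.
func windowedExp(x, e, p *big.Int) *big.Int {
	res := big.NewInt(1)
	var ops [3]big.Int
	ops[0].Set(x)          // x
	ops[1].Mul(x, x)       // x^2
	ops[1].Mod(&ops[1], p)
	ops[2].Mul(&ops[1], x) // x^3
	ops[2].Mod(&ops[2], p)

	for _, w := range e.Bytes() {
		mask := byte(0xc0)
		for j := 0; j < 4; j++ {
			// two squarings shift the accumulator by one 2-bit window
			res.Mul(res, res).Mod(res, p)
			res.Mul(res, res).Mod(res, p)
			c := (w & mask) >> (6 - 2*j)
			if c != 0 {
				res.Mul(res, &ops[c-1]).Mod(res, p)
			}
			mask >>= 2
		}
	}
	return res
}

func main() {
	p := big.NewInt(2305843009213693951) // 2^61 - 1, a Mersenne prime
	x := big.NewInt(123456789)
	e := new(big.Int).SetUint64(0xdeadbeefcafebabe)

	want := new(big.Int).Exp(x, e, p)
	fmt.Println(windowedExp(x, e, p).Cmp(want) == 0) // true
}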
diff --git a/ecc/bw6-633/internal/fptower/e6.go b/ecc/bw6-633/internal/fptower/e6.go index c938b91f3f..11d192b912 100644 --- a/ecc/bw6-633/internal/fptower/e6.go +++ b/ecc/bw6-633/internal/fptower/e6.go @@ -312,22 +312,32 @@ func (z *E6) Inverse(x *E6) *E6 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E6) Exp(x *E6, e big.Int) *E6 { + var res E6 + var ops [3]E6 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/ecc/bw6-761/internal/fptower/e6.go b/ecc/bw6-761/internal/fptower/e6.go index 6261475843..8dda8eaa86 100644 --- a/ecc/bw6-761/internal/fptower/e6.go +++ b/ecc/bw6-761/internal/fptower/e6.go @@ -311,22 +311,32 @@ func (z *E6) Inverse(x *E6) *E6 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E6) Exp(x *E6, e big.Int) *E6 { + var res E6 + var ops [3]E6 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl index 1982772154..616a393bf0 100644 --- a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl @@ -352,22 +352,32 @@ func (z *E12) Inverse(x *E12) *E12 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E12) Exp(x *E12, e big.Int) *E12 { + var res E12 + var ops [3]E12 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } From 1587c99f4f567963fc216c27cfe7b1c6a24e5479 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Mon, 24 Jan 2022 14:59:50 +0100 Subject: [PATCH 03/16] feat(GT): exponentiation in GT using 2-dim windowed GLV --- ecc/bls12-377/internal/fptower/e12.go | 71 +++++++++++++++++++ ecc/bls12-377/internal/fptower/parameters.go | 33 +++++++++ ecc/bls12-377/pairing_test.go | 59 +++++++++++++++ ecc/bls12-381/internal/fptower/e12.go | 71 +++++++++++++++++++ ecc/bls12-381/internal/fptower/parameters.go | 33 +++++++++ ecc/bls12-381/pairing_test.go | 59 +++++++++++++++ ecc/bls24-315/internal/fptower/e24.go | 71 +++++++++++++++++++ ecc/bls24-315/internal/fptower/parameters.go | 33 +++++++++ ecc/bls24-315/pairing_test.go | 61 ++++++++++++++++ ecc/bn254/internal/fptower/e12.go | 71 +++++++++++++++++++ 
ecc/bn254/internal/fptower/parameters.go | 33 +++++++++ ecc/bn254/pairing_test.go | 59 +++++++++++++++ ecc/bw6-633/internal/fptower/e6.go | 71 +++++++++++++++++++ ecc/bw6-633/internal/fptower/parameters.go | 33 +++++++++ ecc/bw6-633/pairing_test.go | 61 ++++++++++++++++ ecc/bw6-761/internal/fptower/e6.go | 71 +++++++++++++++++++ ecc/bw6-761/internal/fptower/parameters.go | 33 +++++++++ ecc/bw6-761/pairing_test.go | 61 ++++++++++++++++ .../pairing/template/tests/pairing.go.tmpl | 69 ++++++++++++++++++ .../template/fq12over6over2/fq12.go.tmpl | 71 +++++++++++++++++++ 20 files changed, 1124 insertions(+) create mode 100644 ecc/bls12-377/internal/fptower/parameters.go create mode 100644 ecc/bls12-381/internal/fptower/parameters.go create mode 100644 ecc/bls24-315/internal/fptower/parameters.go create mode 100644 ecc/bn254/internal/fptower/parameters.go create mode 100644 ecc/bw6-633/internal/fptower/parameters.go create mode 100644 ecc/bw6-761/internal/fptower/parameters.go diff --git a/ecc/bls12-377/internal/fptower/e12.go b/ecc/bls12-377/internal/fptower/e12.go index 6939f52eb8..9c1144dea3 100644 --- a/ecc/bls12-377/internal/fptower/e12.go +++ b/ecc/bls12-377/internal/fptower/e12.go @@ -21,6 +21,7 @@ import ( "errors" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" "math/big" ) @@ -399,7 +400,9 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { } // CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition // x must be in the cyclotomic subgroup +// TODO: use a windowed method func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { var res, xInv E12 xInv.InverseUnitary(x) @@ -418,6 +421,74 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { + + var table [15]E12 + var res E12 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + table[3].Frobenius(a) + + // split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1) / 2; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := (k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, 
&table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/ecc/bls12-377/internal/fptower/parameters.go b/ecc/bls12-377/internal/fptower/parameters.go new file mode 100644 index 0000000000..6f2b1d6c7e --- /dev/null +++ b/ecc/bls12-377/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("9586122913090633729", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go index 96af27ede2..61af6eec91 100644 --- a/ecc/bls12-377/pairing_test.go +++ b/ecc/bls12-377/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -39,6 +40,7 @@ func TestPairing(t *testing.T) { genA := GenE12() genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BLS12-377] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -58,6 +60,27 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BLS12-377] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BLS12-377] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -303,3 +326,39 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bls12-381/internal/fptower/e12.go b/ecc/bls12-381/internal/fptower/e12.go index 572c97e5bd..92345294e6 100644 --- 
a/ecc/bls12-381/internal/fptower/e12.go +++ b/ecc/bls12-381/internal/fptower/e12.go @@ -21,6 +21,7 @@ import ( "errors" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" + "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" "math/big" ) @@ -399,7 +400,9 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { } // CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition // x must be in the cyclotomic subgroup +// TODO: use a windowed method func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { var res, xInv E12 xInv.InverseUnitary(x) @@ -418,6 +421,74 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { + + var table [15]E12 + var res E12 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + table[3].Frobenius(a) + + // split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1) / 2; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := (k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, &table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/ecc/bls12-381/internal/fptower/parameters.go b/ecc/bls12-381/internal/fptower/parameters.go new file mode 100644 index 0000000000..9f97e11751 --- /dev/null +++ b/ecc/bls12-381/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
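For completeness, the 2-NAF digits consumed by CyclotomicExp can be reproduced in a few lines; toyNAF below is an independent illustration, not the ecc.NafDecomposition used by the patch. In non-adjacent form every digit lies in {-1, 0, 1} and no two consecutive digits are non-zero, so on average only a third of the positions cost a multiplication, and each -1 position is served by the precomputed unitary inverse, a mere conjugation. The NAF can be one digit longer than the binary expansion, which the e.BitLen()+3 allocation accommodates with slack:

package main

import (
	"fmt"
	"math/big"
)

// toyNAF returns the signed digits of n (least significant first),
// each in {-1, 0, 1} with no two adjacent non-zero digits.
func toyNAF(n *big.Int) []int8 {
	var digits []int8
	k := new(big.Int).Set(n)
	four := big.NewInt(4)
	zero := big.NewInt(0)
	for k.Cmp(zero) > 0 {
		var d int8
		if k.Bit(0) == 1 {
			m := new(big.Int).Mod(k, four).Int64() // k mod 4 is 1 or 3
			d = int8(2 - m)                        // digit 1 or -1
			k.Sub(k, big.NewInt(int64(d)))         // k is now divisible by 4
		}
		digits = append(digits, d)
		k.Rsh(k, 1)
	}
	return digits
}

func main() {
	n := big.NewInt(0xbeef)
	digits := toyNAF(n)

	// recombine sum of digits[i] * 2^i, most significant digit first
	sum := new(big.Int)
	for i := len(digits) - 1; i >= 0; i-- {
		sum.Lsh(sum, 1)
		sum.Add(sum, big.NewInt(int64(digits[i])))
	}
	fmt.Println(sum.Cmp(n) == 0) // true
}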
+ +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("-15132376222941642752", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bls12-381/pairing_test.go b/ecc/bls12-381/pairing_test.go index cda5bd3f94..c45923175d 100644 --- a/ecc/bls12-381/pairing_test.go +++ b/ecc/bls12-381/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -39,6 +40,7 @@ func TestPairing(t *testing.T) { genA := GenE12() genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BLS12-381] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -58,6 +60,27 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BLS12-381] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BLS12-381] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -303,3 +326,39 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bls24-315/internal/fptower/e24.go b/ecc/bls24-315/internal/fptower/e24.go index 4863295236..ad4abd0661 100644 --- a/ecc/bls24-315/internal/fptower/e24.go +++ b/ecc/bls24-315/internal/fptower/e24.go @@ -19,6 +19,7 @@ package fptower import ( "errors" "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" "math/big" ) @@ -397,7 +398,9 @@ func (z *E24) Exp(x *E24, e big.Int) *E24 { } // CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition // x must be in the cyclotomic subgroup +// TODO: use a windowed method func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { var res, xInv E24 xInv.InverseUnitary(x) @@ -416,6 +419,74 @@ func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { + + var table [15]E24 + var res E24 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + table[3].Frobenius(a) 
+ + // split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1)/2 + 1; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := (k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, &table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E24) InverseUnitary(x *E24) *E24 { return z.Conjugate(x) diff --git a/ecc/bls24-315/internal/fptower/parameters.go b/ecc/bls24-315/internal/fptower/parameters.go new file mode 100644 index 0000000000..71ac9072cd --- /dev/null +++ b/ecc/bls24-315/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
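The windowed double-exponentiation at the heart of ExpGLV is easy to sanity-check in a plain multiplicative group, where the endomorphism is just another power map. The sketch below rebuilds the 15-entry table (table[s-1] = phi(a)^(b3b2) * a^(b1b0)) and the joint 2-bit scan for single-limb scalars, then compares against the naive product a^k1 * phi(a)^k2 mod p; the prime, base, eigenvalue and scalars are arbitrary test values, not curve parameters:

package main

import (
	"fmt"
	"math/big"
)

func main() {
	p := big.NewInt(2305843009213693951)  // 2^61 - 1, arbitrary prime modulus
	a := big.NewInt(987654321)            // arbitrary base
	lambda := big.NewInt(1234567)         // stand-in for the Frobenius eigenvalue
	phi := new(big.Int).Exp(a, lambda, p) // phi(a) = a^lambda in this toy group

	// single-limb "scalars", playing the roles of k1[0] and k2[0]
	k1 := uint64(0x0123456789abcdef)
	k2 := uint64(0xfedcba9876543210)

	// table[b3b2b1b0-1] = phi(a)^(b3b2) * a^(b1b0), exactly as in the patch
	var table [15]big.Int
	table[0].Set(a)
	table[3].Set(phi)
	table[1].Mul(&table[0], &table[0]).Mod(&table[1], p)
	table[2].Mul(&table[1], &table[0]).Mod(&table[2], p)
	table[4].Mul(&table[3], &table[0]).Mod(&table[4], p)
	table[5].Mul(&table[3], &table[1]).Mod(&table[5], p)
	table[6].Mul(&table[3], &table[2]).Mod(&table[6], p)
	table[7].Mul(&table[3], &table[3]).Mod(&table[7], p)
	table[8].Mul(&table[7], &table[0]).Mod(&table[8], p)
	table[9].Mul(&table[7], &table[1]).Mod(&table[9], p)
	table[10].Mul(&table[7], &table[2]).Mod(&table[10], p)
	table[11].Mul(&table[7], &table[3]).Mod(&table[11], p)
	table[12].Mul(&table[11], &table[0]).Mod(&table[12], p)
	table[13].Mul(&table[11], &table[1]).Mod(&table[13], p)
	table[14].Mul(&table[11], &table[2]).Mod(&table[14], p)

	// joint 2-bit windowed scan over the single limb
	res := big.NewInt(1)
	mask := uint64(3) << 62
	for j := 0; j < 32; j++ {
		res.Mul(res, res).Mod(res, p)
		res.Mul(res, res).Mod(res, p)
		b1 := (k1 & mask) >> (62 - 2*j)
		b2 := (k2 & mask) >> (62 - 2*j)
		if b1|b2 != 0 {
			s := b2<<2 | b1
			res.Mul(res, &table[s-1]).Mod(res, p)
		}
		mask >>= 2
	}

	// reference: a^k1 * phi(a)^k2 mod p
	want := new(big.Int).Exp(a, new(big.Int).SetUint64(k1), p)
	want.Mul(want, new(big.Int).Exp(phi, new(big.Int).SetUint64(k2), p)).Mod(want, p)

	fmt.Println(res.Cmp(want) == 0) // true
}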
+ +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("-3218079743", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go index e54fca70b4..7f2b9e04df 100644 --- a/ecc/bls24-315/pairing_test.go +++ b/ecc/bls24-315/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -40,6 +41,7 @@ func TestPairing(t *testing.T) { genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BLS24-315] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -59,6 +61,28 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BLS24-315] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(24) + + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BLS24-315] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -304,3 +328,40 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(24) + + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bn254/internal/fptower/e12.go b/ecc/bn254/internal/fptower/e12.go index 15e808a9b1..a0afb02df2 100644 --- a/ecc/bn254/internal/fptower/e12.go +++ b/ecc/bn254/internal/fptower/e12.go @@ -21,6 +21,7 @@ import ( "errors" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fp" + "github.com/consensys/gnark-crypto/ecc/bn254/fr" "math/big" ) @@ -399,7 +400,9 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { } // CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition // x must be in the cyclotomic subgroup +// TODO: use a windowed method func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { var res, xInv E12 xInv.InverseUnitary(x) @@ -418,6 +421,74 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { + + var table [15]E12 + var res E12 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + table[3].Frobenius(a) + + // 
split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1) / 2; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := (k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, &table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E12) InverseUnitary(x *E12) *E12 { return z.Conjugate(x) diff --git a/ecc/bn254/internal/fptower/parameters.go b/ecc/bn254/internal/fptower/parameters.go new file mode 100644 index 0000000000..3859aeac76 --- /dev/null +++ b/ecc/bn254/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
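A note on the xGen constants introduced by these parameters.go files: the name, and the "generator of the curve" comment, undersell what the value is used for. On GT, the r-torsion of the cyclotomic subgroup, the p-power Frobenius acts as exponentiation by t - 1 mod r, because r divides p + 1 - t. For the BLS12 and BLS24 curves t - 1 equals the curve seed x (hence 9586122913090633729 for BLS12-377 and -15132376222941642752 for BLS12-381), while for BN254, where t = 6x^2 + 1, the value set just below is 6x^2 for the seed x = 4965661367192848881. In every case xGen is the GLV eigenvalue lambda satisfying Frobenius(a) = a^lambda on GT, which is exactly what PrecomputeLattice(_r, &xGen, &glvBasis) needs to produce short basis vectors for SplitScalar.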
+ +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bn254/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("147946756881789318990833708069417712966", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bn254/pairing_test.go b/ecc/bn254/pairing_test.go index 8948b1872f..a875603cde 100644 --- a/ecc/bn254/pairing_test.go +++ b/ecc/bn254/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bn254/fp" "github.com/consensys/gnark-crypto/ecc/bn254/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -39,6 +40,7 @@ func TestPairing(t *testing.T) { genA := GenE12() genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BN254] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -58,6 +60,27 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BN254] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BN254] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -303,3 +326,39 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bw6-633/internal/fptower/e6.go b/ecc/bw6-633/internal/fptower/e6.go index 11d192b912..6003ad5a8d 100644 --- a/ecc/bw6-633/internal/fptower/e6.go +++ b/ecc/bw6-633/internal/fptower/e6.go @@ -22,6 +22,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" + "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" ) // E6 is a degree two finite field extension of fp3 @@ -342,7 +343,9 @@ func (z *E6) Exp(x *E6, e big.Int) *E6 { } // CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition // x must be in the cyclotomic subgroup +// TODO: use a windowed method func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { var res, xInv E6 xInv.InverseUnitary(x) @@ -361,6 +364,74 @@ func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { + + var table [15]E6 + var res E6 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + 
table[3].Frobenius(a) + + // split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1) / 2; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := (k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, &table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E6) InverseUnitary(x *E6) *E6 { return z.Conjugate(x) diff --git a/ecc/bw6-633/internal/fptower/parameters.go b/ecc/bw6-633/internal/fptower/parameters.go new file mode 100644 index 0000000000..a929cac1f1 --- /dev/null +++ b/ecc/bw6-633/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
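On the expected gains: SplitScalar returns k1 and k2 of roughly half the bit-length of r, and the joint 2-bit scan shares its cyclotomic squarings between the two half-scalars, so ExpGLV performs about (log2 r)/2 squarings plus at most one table multiplication per 2-bit window, after a one-off precomputation of 14 products. The plain windowed Exp, by contrast, squares once per bit of the full exponent. The two TODOs point at the usual refinements: a signed-digit (2-NAF) table, cheap here because inverting a unitary element is a conjugation, and higher-dimensional decompositions that exploit more powers of the Frobenius.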
+ +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("37014442673353839783463348892746893664389658635873267609916377398480286678854893830143", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go index bdffe61d81..fef2c615a4 100644 --- a/ecc/bw6-633/pairing_test.go +++ b/ecc/bw6-633/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -40,6 +41,7 @@ func TestPairing(t *testing.T) { genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BW6-633] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -59,6 +61,28 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BW6-633] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(6) + + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BW6-633] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -304,3 +328,40 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(6) + + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bw6-761/internal/fptower/e6.go b/ecc/bw6-761/internal/fptower/e6.go index 8dda8eaa86..a2cf6e4c47 100644 --- a/ecc/bw6-761/internal/fptower/e6.go +++ b/ecc/bw6-761/internal/fptower/e6.go @@ -22,6 +22,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" + "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" ) // E6 is a degree two finite field extension of fp3 @@ -341,7 +342,9 @@ func (z *E6) Exp(x *E6, e big.Int) *E6 { } // CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition // x must be in the cyclotomic subgroup +// TODO: use a windowed method func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { var res, xInv E6 xInv.InverseUnitary(x) @@ -360,6 +363,74 @@ func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { + + var table [15]E6 + var res E6 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = 
b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + table[3].Frobenius(a) + + // split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1) / 2; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := (k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, &table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E6) InverseUnitary(x *E6) *E6 { return z.Conjugate(x) diff --git a/ecc/bw6-761/internal/fptower/parameters.go b/ecc/bw6-761/internal/fptower/parameters.go new file mode 100644 index 0000000000..2ec4ef2b19 --- /dev/null +++ b/ecc/bw6-761/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
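One implementation detail worth flagging: k1.SetBigInt(&k[0]).FromMont() loads the half-scalar into an fr.Element, which internally stores values in Montgomery form, and then converts back to the regular representation, so that the limbs k1[i] read by the window masks carry the actual bits of the scalar rather than of its Montgomery image. The loop bound of len(k1)/2 then only visits the low limbs a half-length scalar can occupy; the higher limbs are guaranteed zero by the lattice bounds.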
+ +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("3362637538168598222219435186298528655381674028954528064283340709388076588006567983337308081752755143497537638367247", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go index 2ec8af9ac7..14dc68d169 100644 --- a/ecc/bw6-761/pairing_test.go +++ b/ecc/bw6-761/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -40,6 +41,7 @@ func TestPairing(t *testing.T) { genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BW6-761] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -59,6 +61,28 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BW6-761] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(6) + + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BW6-761] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -304,3 +328,40 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(6) + + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/internal/generator/pairing/template/tests/pairing.go.tmpl b/internal/generator/pairing/template/tests/pairing.go.tmpl index 09a0840a96..fcf4058bc7 100644 --- a/internal/generator/pairing/template/tests/pairing.go.tmpl +++ b/internal/generator/pairing/template/tests/pairing.go.tmpl @@ -4,6 +4,7 @@ import ( "testing" "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" + "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -27,6 +28,7 @@ func TestPairing(t *testing.T) { {{- end}} genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[{{ toUpper .Name}}] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -46,6 +48,32 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[{{ toUpper .Name}}] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + {{if or (eq .Name "bw6-761") (eq .Name "bw6-633")}} + k := 
new(big.Int).SetUint64(6)
+	{{else if eq .Name "bls24-315"}}
+	k := new(big.Int).SetUint64(24)
+	{{ else }}
+	k := new(big.Int).SetUint64(12)
+	{{- end}}
+	e.Exp(e, k)
+	e.ToBigIntRegular(&_e)
+
+	var b, c, d GT
+	b.Exp(&a, _e)
+	c.ExpGLV(&a, &_e)
+	d.CyclotomicExp(&a, _e)
+
+	return b.Equal(&c) && c.Equal(&d)
+	},
+	genA,
+	genP,
+	))
+
 	properties.Property("[{{ toUpper .Name}}] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll(
 		func(a GT) bool {
 			var b, c, d GT
@@ -296,3 +324,44 @@ func BenchmarkMultiPair(b *testing.B) {
 		})
 	}
 }
+
+func BenchmarkExpGT(b *testing.B) {
+
+	var a GT
+	a.SetRandom()
+	a = FinalExponentiation(&a)
+
+	var e fp.Element
+	e.SetRandom()
+	{{if or (eq .Name "bw6-761") (eq .Name "bw6-633")}}
+	k := new(big.Int).SetUint64(6)
+	{{else if eq .Name "bls24-315"}}
+	k := new(big.Int).SetUint64(24)
+	{{ else }}
+	k := new(big.Int).SetUint64(12)
+	{{- end}}
+	e.Exp(e, k)
+	var _e big.Int
+	e.ToBigIntRegular(&_e)
+
+	b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) {
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			a.Exp(&a, _e)
+		}
+	})
+
+	b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) {
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			a.CyclotomicExp(&a, _e)
+		}
+	})
+
+	b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) {
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			a.ExpGLV(&a, &_e)
+		}
+	})
+}
diff --git a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl
index 616a393bf0..25f9b4eaf9 100644
--- a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl
+++ b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp"
+	"github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr"
 )
 
 // E12 is a degree two finite field extension of fp6
@@ -382,7 +383,9 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 {
 }
 
 // CyclotomicExp sets z=x**e and returns it
+// uses 2-NAF decomposition
 // x must be in the cyclotomic subgroup
+// TODO: use a windowed method
 func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 {
 	var res, xInv E12
 	xInv.InverseUnitary(x)
@@ -401,6 +404,74 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 {
 	return z
 }
 
+// ExpGLV sets z=x**e and returns it
+// uses 2-dimensional GLV with 2-bits windowed method
+// x must be in GT
+// TODO: use 2-NAF
+// TODO: use higher dimensional decomposition
+func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 {
+
+	var table [15]E12
+	var res E12
+	var k1, k2 fr.Element
+
+	res.SetOne()
+
+	// table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a
+	table[0].Set(a)
+	table[3].Frobenius(a)
+
+	// split the scalar, modifies +-x, Frob(x) accordingly
+	k := ecc.SplitScalar(s, &glvBasis)
+
+	if k[0].Sign() == -1 {
+		k[0].Neg(&k[0])
+		table[0].InverseUnitary(&table[0])
+	}
+	if k[1].Sign() == -1 {
+		k[1].Neg(&k[1])
+		table[3].InverseUnitary(&table[3])
+	}
+
+	// precompute table (2 bits sliding window)
+	// table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0
+	table[1].CyclotomicSquare(&table[0])
+	table[2].Mul(&table[1], &table[0])
+	table[4].Mul(&table[3], &table[0])
+	table[5].Mul(&table[3], &table[1])
+	table[6].Mul(&table[3], &table[2])
+	table[7].CyclotomicSquare(&table[3])
+	table[8].Mul(&table[7], &table[0])
+	table[9].Mul(&table[7], &table[1])
+	table[10].Mul(&table[7], &table[2])
+	table[11].Mul(&table[7], &table[3])
+	table[12].Mul(&table[11], 
&table[0])
+	table[13].Mul(&table[11], &table[1])
+	table[14].Mul(&table[11], &table[2])
+
+	// bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max
+	k1.SetBigInt(&k[0]).FromMont()
+	k2.SetBigInt(&k[1]).FromMont()
+
+	// loop starts from len(k1)/2 due to the bounds
+	for i := len(k1) / 2; i >= 0; i-- {
+		mask := uint64(3) << 62
+		for j := 0; j < 32; j++ {
+			res.CyclotomicSquare(&res).CyclotomicSquare(&res)
+			b1 := (k1[i] & mask) >> (62 - 2*j)
+			b2 := (k2[i] & mask) >> (62 - 2*j)
+			if b1|b2 != 0 {
+				s := (b2<<2 | b1)
+				res.Mul(&res, &table[s-1])
+			}
+			mask = mask >> 2
+		}
+	}
+
+	p.Set(&res)
+	return p
+}
+
 // InverseUnitary inverse a unitary element
 func (z *E12) InverseUnitary(x *E12) *E12 {
 	return z.Conjugate(x)

From 095417b3a4969dbdcc00c8112b8259bd8db5baa7 Mon Sep 17 00:00:00 2001
From: Youssef El Housni
Date: Mon, 24 Jan 2022 16:29:52 +0100
Subject: [PATCH 04/16] feat(GT, bls12-377): bucket-list MSM

---
 ecc/bls12-377/internal/fptower/multiexp.go | 1229 ++++++++++++++++++++
 ecc/bls12-377/pairing_test.go              |  143 +++
 2 files changed, 1372 insertions(+)
 create mode 100644 ecc/bls12-377/internal/fptower/multiexp.go

diff --git a/ecc/bls12-377/internal/fptower/multiexp.go b/ecc/bls12-377/internal/fptower/multiexp.go
new file mode 100644
index 0000000000..37ef35ea04
--- /dev/null
+++ b/ecc/bls12-377/internal/fptower/multiexp.go
@@ -0,0 +1,1229 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bls12-377/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
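+// (editor's note, not in the original comment) equivalently, each scalar is rewritten in a
+// signed-digit base-2^c representation with digits in [-2^{c-1}, 2^{c-1}-1]; a negative digit d
+// is stored as uint64(-d-1) with the window's most significant bit set as a sign flag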
+
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
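+				// worked example (editor's illustration, not in the original patch): with c = 4,
+				// max = 8, a window value of 13 is recoded as 13 - 16 = -3 with carry = 1 into the
+				// next window, and stored below as uint64(-(-3)-1) | msbWindow = 2 | 0b1000 = 0b1010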
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E12) MultiExp(points []E12, scalars []fr.Element, config ecc.MultiExpConfig) (*E12, error) {
+	// note:
+	// each of the MsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunks digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
+	// (in this GT variant the buckets are combined with E12 multiplications rather than the jacobian
+	// extended formulas used by the curve templates)
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on an MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
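+		// worked example (editor's illustration, not in the original patch): with fr.Limbs*64 = 256
+		// and nbPoints = 1<<20, cost(16) = (256/16)*(2^20+2^16) ~ 17.8M group ops, while
+		// cost(12) = (256/12)*(2^20+2^12) ~ 22.5M, so the loop above settles on C = 16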
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerE12 , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]E12, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerE12(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerE12(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.Mul(p, &_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerE12(p *E12, c int, points []E12, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.MsmC4(points, scalars, splitFirstChunk) + + case 5: + p.MsmC5(points, scalars, splitFirstChunk) + + case 6: + p.MsmC6(points, scalars, splitFirstChunk) + + case 7: + p.MsmC7(points, scalars, splitFirstChunk) + + case 8: + p.MsmC8(points, scalars, splitFirstChunk) + + case 9: + p.MsmC9(points, scalars, splitFirstChunk) + + case 10: + p.MsmC10(points, scalars, splitFirstChunk) + + case 11: + p.MsmC11(points, scalars, splitFirstChunk) + + case 12: + p.MsmC12(points, scalars, splitFirstChunk) + + case 13: + p.MsmC13(points, scalars, splitFirstChunk) + + case 14: + p.MsmC14(points, scalars, splitFirstChunk) + + case 15: + p.MsmC15(points, scalars, splitFirstChunk) + + case 16: + p.MsmC16(points, scalars, splitFirstChunk) + + case 20: + p.MsmC20(points, scalars, splitFirstChunk) + + case 21: + p.MsmC21(points, scalars, splitFirstChunk) + + case 22: + p.MsmC22(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkE12 reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkE12(p *E12, c int, chChunks []chan E12) *E12 { + var _p E12 + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.CyclotomicSquare(&_p) + } + totalj := <-chChunks[j] + _p.Mul(&_p, &totalj) + } + + p.Set(&_p) + return p +} + +func msmProcessChunkE12(chunk uint64, + chRes chan<- E12, + buckets []E12, + c uint64, + points []E12, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].SetOne() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + 
s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		var tmp E12
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].Mul(&buckets[bits-1], &points[i])
+		} else {
+			// sub
+			tmp.Conjugate(&points[i])
+			buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp)
+		}
+	}
+
+	// reduce buckets into total
+	// total =  bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var one, runningSum, total E12
+	runningSum.SetOne()
+	total.SetOne()
+	one.SetOne()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].Equal(&one) {
+			runningSum.Mul(&runningSum, &buckets[k])
+		}
+		total.Mul(&total, &runningSum)
+	}
+
+	chRes <- total
+
+}
+
+func (p *E12) MsmC4(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 {
+	const (
+		c        = 4                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E12
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E12, 1)
+	}
+
+	processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) {
+		var buckets [1 << (c - 1)]E12
+		msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E12, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE12(p, c, chChunks[:])
+}
+
+func (p *E12) MsmC5(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 {
+	const (
+		c        = 5                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan E12
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E12, 1)
+	}
+
+	// c doesn't divide 256, last window is smaller we can allocate less buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	go func(j uint64, points []E12, scalars []fr.Element) {
+		var buckets [1 << (lastC - 1)]E12
+		msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars)
+	}(uint64(nbChunks), points, scalars)
+
+	processChunk := func(j 
int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC6(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 6 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC7(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 7 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + 
msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC8(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC9(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 9 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() 
{ + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC10(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC11(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + 
return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC12(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC13(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC14(points []E12, scalars []fr.Element, 
splitFirstChunk bool) *E12 { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC15(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC16(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = 
(fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC20(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC21(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + 
chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC22(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 22 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go index 61af6eec91..7c7cf3be6c 100644 --- a/ecc/bls12-377/pairing_test.go +++ b/ecc/bls12-377/pairing_test.go @@ -19,8 +19,11 @@ package bls12377 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" "github.com/leanovate/gopter" @@ -232,6 +235,112 @@ func TestPairing(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := 
gopter.NewProperties(parameters)
+
+	genScalar := GenFr()
+
+	// size of the multiExps
+	const nbSamples = 143
+
+	// multi exp points
+	var samplePoints [nbSamples]GT
+	var _g, g GT
+	_g.SetRandom()
+
+	// put into GT
+	_g = FinalExponentiation(&_g)
+
+	g.Set(&_g)
+	for i := 1; i <= nbSamples; i++ {
+		samplePoints[i-1].Set(&g)
+		g.Mul(&g, &_g)
+	}
+
+	// final scalar to use in double and add method (without mixer factor)
+	// n(n+1)(2n+1)/6  (sum of the squares from 1 to n)
+	var scalar big.Int
+	scalar.SetInt64(nbSamples)
+	scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1))
+	scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1))
+	scalar.Div(&scalar, new(big.Int).SetInt64(6))
+
+	// ensure a multiexp that is split has the same result as a non-split one
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, split1, split2 GT
+
+			// mixer ensures that all the words of an fr.Element are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePointsLarge[:], scalars16, true)
+
+			split1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			split2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&split1) && r16.Equal(&split2)
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect to have a different multiExp than the above bucket method
+	// for small numbers of points
+	properties.Property("[GT] Multi exponentiation (<50 points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of an fr.Element are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+				samplePoints[i-1].Set(&g)
+				g.Mul(&g, &_g)
+			}
+
+			var op1MultiExp GT
+			op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{})
+
+			var finalBigScalar fr.Element
+			var finalBigScalarBi big.Int
+			var op1ScalarMul GT
+			finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer)
+			finalBigScalar.ToBigIntRegular(&finalBigScalarBi)
+			op1ScalarMul.ExpGLV(&_g, &finalBigScalarBi)
+
+			return op1ScalarMul.Equal(&op1MultiExp)
+		},
+		genScalar,
+	))
+
+	properties.TestingRun(t, gopter.ConsoleReporter(false))
+}
+
 // ------------------------------------------------------------
 // benches
 
@@ -362,3 +471,37 @@ func BenchmarkExpGT(b *testing.B) {
 		}
 	})
 }
+
+func BenchmarkMultiExpGT(b *testing.B) {
+	// ensure every word of the scalars is filled
+	var mixer fr.Element
+	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
+
+	const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits
+	const nbSamples = 1 << pow
+
+	var samplePoints [nbSamples]GT
+	var sampleScalars [nbSamples]fr.Element
+	var _g GT
+	_g.SetRandom()
+
+	for i := 1; i <= nbSamples; i++ {
+		sampleScalars[i-1].SetUint64(uint64(i)).
+			Mul(&sampleScalars[i-1], &mixer).
+			FromMont()
+		samplePoints[i-1].Set(&_g)
+	}
+
+	var testPoint GT
+
+	for i := 5; i <= pow; i++ {
+		using := 1 << i
+
+		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
+	}
+}

From f993ed9a513120437d339bacce87d1f38490f942 Mon Sep 17 00:00:00 2001
From: Youssef El Housni
Date: Tue, 22 Mar 2022 17:57:28 +0100
Subject: [PATCH 05/16] fix: parameters of ExpGLV for the new curves

---
 ecc/bls12-378/internal/fptower/e12.go        | 114 ++++++++++++++++++-
 ecc/bls12-378/internal/fptower/e12_test.go   |  25 ++++
 ecc/bls12-378/internal/fptower/parameters.go |  33 ++++++
 ecc/bls12-378/pairing_test.go                |  59 ++++++++++
 ecc/bw6-633/internal/fptower/parameters.go   |   2 +-
 ecc/bw6-756/internal/fptower/e6.go           | 114 ++++++++++++++++++-
 ecc/bw6-756/internal/fptower/parameters.go   |  33 ++++++
 ecc/bw6-756/pairing_test.go                  |  59 ++++++++++
 ecc/bw6-761/internal/fptower/e3.go           |   4 +-
 ecc/bw6-761/internal/fptower/parameters.go   |   2 +-
 10 files changed, 429 insertions(+), 16 deletions(-)
 create mode 100644 ecc/bls12-378/internal/fptower/parameters.go
 create mode 100644 ecc/bw6-756/internal/fptower/parameters.go

diff --git a/ecc/bls12-378/internal/fptower/e12.go b/ecc/bls12-378/internal/fptower/e12.go
index 07c716fbe4..6f7da023cd 100644
--- a/ecc/bls12-378/internal/fptower/e12.go
+++ b/ecc/bls12-378/internal/fptower/e12.go
@@ -19,7 +19,9 @@ package fptower
 import (
 	"encoding/binary"
 	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/bls12-378/fp"
+	"github.com/consensys/gnark-crypto/ecc/bls12-378/fr"
 	"math/big"
 )
 
@@ -368,25 +370,125 @@ func (z *E12) Inverse(x *E12) *E12 {
 }
 
 // Exp sets z=x**e and returns it
+// uses 2-bits windowed method
 func (z *E12) Exp(x *E12, e big.Int) *E12 {
 	var res E12
+	var ops [3]E12
+
 	res.SetOne()
+	ops[0].Set(x)
+	ops[1].Square(&ops[0])
+	ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1])
+
 	b := e.Bytes()
 	for i := range b {
 		w := b[i]
-		mask := byte(0x80)
-		for j := 7; j >= 0; j-- {
-			res.Square(&res)
-			if (w&mask)>>j != 0 {
-				res.Mul(&res, x)
+		mask := byte(0xc0)
+		for j := 0; j < 4; j++ {
+			res.Square(&res).Square(&res)
+			c := (w & mask) >> (6 - 2*j)
+			if c != 0 {
+				res.Mul(&res, &ops[c-1])
 			}
-			mask = mask 
>> 1
+			mask = mask >> 2
+		}
+	}
+	z.Set(&res)
+
+	return z
+}
+
+// CyclotomicExp sets z=x**e and returns it
+// uses 2-NAF decomposition
+// x must be in the cyclotomic subgroup
+// TODO: use a windowed method
+func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 {
+	var res, xInv E12
+	xInv.InverseUnitary(x)
+	res.SetOne()
+	eNAF := make([]int8, e.BitLen()+3)
+	n := ecc.NafDecomposition(&e, eNAF[:])
+	for i := n - 1; i >= 0; i-- {
+		res.CyclotomicSquare(&res)
+		if eNAF[i] == 1 {
+			res.Mul(&res, x)
+		} else if eNAF[i] == -1 {
+			res.Mul(&res, &xInv)
+		}
+	}
+	z.Set(&res)
+	return z
+}
+
+// ExpGLV sets z=x**e and returns it
+// uses 2-dimensional GLV with 2-bits windowed method
+// x must be in GT
+// TODO: use 2-NAF
+// TODO: use higher dimensional decomposition
+func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 {
+
+	var table [15]E12
+	var res E12
+	var k1, k2 fr.Element
+
+	res.SetOne()
+
+	// table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a
+	table[0].Set(a)
+	table[3].Frobenius(a)
+
+	// split the scalar, modifies +-x, Frob(x) accordingly
+	k := ecc.SplitScalar(s, &glvBasis)
+
+	if k[0].Sign() == -1 {
+		k[0].Neg(&k[0])
+		table[0].InverseUnitary(&table[0])
+	}
+	if k[1].Sign() == -1 {
+		k[1].Neg(&k[1])
+		table[3].InverseUnitary(&table[3])
+	}
+
+	// precompute table (2 bits sliding window)
+	// table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0
+	table[1].CyclotomicSquare(&table[0])
+	table[2].Mul(&table[1], &table[0])
+	table[4].Mul(&table[3], &table[0])
+	table[5].Mul(&table[3], &table[1])
+	table[6].Mul(&table[3], &table[2])
+	table[7].CyclotomicSquare(&table[3])
+	table[8].Mul(&table[7], &table[0])
+	table[9].Mul(&table[7], &table[1])
+	table[10].Mul(&table[7], &table[2])
+	table[11].Mul(&table[7], &table[3])
+	table[12].Mul(&table[11], &table[0])
+	table[13].Mul(&table[11], &table[1])
+	table[14].Mul(&table[11], &table[2])
+
+	// bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max
+	k1.SetBigInt(&k[0]).FromMont()
+	k2.SetBigInt(&k[1]).FromMont()
+
+	// loop starts from len(k1)/2 due to the bounds
+	for i := len(k1) / 2; i >= 0; i-- {
+		mask := uint64(3) << 62
+		for j := 0; j < 32; j++ {
+			res.CyclotomicSquare(&res).CyclotomicSquare(&res)
+			b1 := (k1[i] & mask) >> (62 - 2*j)
+			b2 := (k2[i] & mask) >> (62 - 2*j)
+			if b1|b2 != 0 {
+				s := (b2<<2 | b1)
+				res.Mul(&res, &table[s-1])
+			}
+			mask = mask >> 2
+		}
+	}
+
+	p.Set(&res)
+	return p
+}
+
 // InverseUnitary inverse a unitary element
 func (z *E12) InverseUnitary(x *E12) *E12 {
 	return z.Conjugate(x)
diff --git a/ecc/bls12-378/internal/fptower/e12_test.go b/ecc/bls12-378/internal/fptower/e12_test.go
index 6f5a6ee749..1830c3fadd 100644
--- a/ecc/bls12-378/internal/fptower/e12_test.go
+++ b/ecc/bls12-378/internal/fptower/e12_test.go
@@ -17,6 +17,7 @@
 package fptower
 
 import (
+	"math/big"
 	"testing"
 
 	"github.com/consensys/gnark-crypto/ecc/bls12-378/fp"
@@ -192,6 +193,7 @@ func TestE12Ops(t *testing.T) {
 
 	genA := GenE12()
 	genB := GenE12()
+	genExp := GenFp()
 
 	properties.Property("[BLS12-378] sub & add should leave an element invariant", prop.ForAll(
 		func(a, b *E12) bool {
@@ -349,6 +351,29 @@ func TestE12Ops(t *testing.T) {
 		genA,
 	))
 
+	properties.Property("[BLS12-378] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll(
+		func(a *E12, e fp.Element) bool {
+			var b, c, d E12
+			// put in the cyclo subgroup
+			b.Conjugate(a)
+			a.Inverse(a)
+			b.Mul(&b, a)
+			a.FrobeniusSquare(&b).Mul(a, &b)
+
+			var _e big.Int
+			k := new(big.Int).SetUint64(12)
+			e.Exp(e, k)
+			e.ToBigIntRegular(&_e)
+
+			
c.Exp(a, _e) + d.CyclotomicExp(a, _e) + + return c.Equal(&d) + }, + genA, + genExp, + )) + properties.Property("[BLS12-378] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/ecc/bls12-378/internal/fptower/parameters.go b/ecc/bls12-378/internal/fptower/parameters.go new file mode 100644 index 0000000000..7d5ea1a4c3 --- /dev/null +++ b/ecc/bls12-378/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("11045256207009841153", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go index 5dd1db4415..89c3bafc3e 100644 --- a/ecc/bls12-378/pairing_test.go +++ b/ecc/bls12-378/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -39,6 +40,7 @@ func TestPairing(t *testing.T) { genA := GenE12() genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BLS12-378] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -58,6 +60,27 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BLS12-378] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BLS12-378] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -303,3 +326,39 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bw6-633/internal/fptower/parameters.go b/ecc/bw6-633/internal/fptower/parameters.go index 
a929cac1f1..308498b0c6 100644 --- a/ecc/bw6-633/internal/fptower/parameters.go +++ b/ecc/bw6-633/internal/fptower/parameters.go @@ -21,7 +21,7 @@ import ( "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" ) -// generator of the curve +// t-1 var xGen big.Int var glvBasis ecc.Lattice diff --git a/ecc/bw6-756/internal/fptower/e6.go b/ecc/bw6-756/internal/fptower/e6.go index 7a794fb0cc..76e9f7b1dd 100644 --- a/ecc/bw6-756/internal/fptower/e6.go +++ b/ecc/bw6-756/internal/fptower/e6.go @@ -20,6 +20,7 @@ import ( "errors" "math/big" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" ) @@ -311,25 +312,125 @@ func (z *E6) Inverse(x *E6) *E6 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E6) Exp(x *E6, e big.Int) *E6 { + var res E6 + var ops [3]E6 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 + } + } + z.Set(&res) + + return z +} + +// CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition +// x must be in the cyclotomic subgroup +// TODO: use a windowed method +func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { + var res, xInv E6 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, x) + } else if eNAF[i] == -1 { + res.Mul(&res, &xInv) } } z.Set(&res) return z } +// ExpGLV sets z=x**e and returns it +// uses 2-dimensional GLV with 2-bits windowed method +// x must be in GT +// TODO: use 2-NAF +// TODO: use higher dimensional decomposition +func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { + + var table [15]E6 + var res E6 + var k1, k2 fr.Element + + res.SetOne() + + // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a + table[0].Set(a) + table[3].Frobenius(a) + + // split the scalar, modifies +-x, Frob(x) accordingly + k := ecc.SplitScalar(s, &glvBasis) + + if k[0].Sign() == -1 { + k[0].Neg(&k[0]) + table[0].InverseUnitary(&table[0]) + } + if k[1].Sign() == -1 { + k[1].Neg(&k[1]) + table[3].InverseUnitary(&table[3]) + } + + // precompute table (2 bits sliding window) + // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + table[1].CyclotomicSquare(&table[0]) + table[2].Mul(&table[1], &table[0]) + table[4].Mul(&table[3], &table[0]) + table[5].Mul(&table[3], &table[1]) + table[6].Mul(&table[3], &table[2]) + table[7].CyclotomicSquare(&table[3]) + table[8].Mul(&table[7], &table[0]) + table[9].Mul(&table[7], &table[1]) + table[10].Mul(&table[7], &table[2]) + table[11].Mul(&table[7], &table[3]) + table[12].Mul(&table[11], &table[0]) + table[13].Mul(&table[11], &table[1]) + table[14].Mul(&table[11], &table[2]) + + // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max + k1.SetBigInt(&k[0]).FromMont() + k2.SetBigInt(&k[1]).FromMont() + + // loop starts from len(k1)/2 due to the bounds + for i := len(k1) / 2; i >= 0; i-- { + mask := uint64(3) << 62 + for j := 0; j < 32; j++ { + res.CyclotomicSquare(&res).CyclotomicSquare(&res) + b1 := (k1[i] & mask) >> (62 - 2*j) + b2 := 
(k2[i] & mask) >> (62 - 2*j) + if b1|b2 != 0 { + s := (b2<<2 | b1) + res.Mul(&res, &table[s-1]) + } + mask = mask >> 2 + } + } + + p.Set(&res) + return p +} + // InverseUnitary inverse a unitary element func (z *E6) InverseUnitary(x *E6) *E6 { return z.Conjugate(x) @@ -404,6 +505,7 @@ func (z *E6) SetBytes(e []byte) error { } // IsInSubGroup ensures GT/E6 is in correct sugroup +// TODO: optimize func (z *E6) IsInSubGroup() bool { var one, _z E6 one.SetOne() diff --git a/ecc/bw6-756/internal/fptower/parameters.go b/ecc/bw6-756/internal/fptower/parameters.go new file mode 100644 index 0000000000..8a8ce6f783 --- /dev/null +++ b/ecc/bw6-756/internal/fptower/parameters.go @@ -0,0 +1,33 @@ +// Copyright 2020 ConsenSys AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" +) + +// t-1 +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("164391353554439166353793911729193406645071739502673898176639736370075683438438023898983435337730", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go index 7db065814e..21c46affe0 100644 --- a/ecc/bw6-756/pairing_test.go +++ b/ecc/bw6-756/pairing_test.go @@ -21,6 +21,7 @@ import ( "math/big" "testing" + "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" @@ -40,6 +41,7 @@ func TestPairing(t *testing.T) { genR1 := GenFr() genR2 := GenFr() + genP := GenFp() properties.Property("[BW6-756] Having the receiver as operand (final expo) should output the same result", prop.ForAll( func(a GT) bool { @@ -59,6 +61,27 @@ func TestPairing(t *testing.T) { genA, )) + properties.Property("[BW6-756] Exp, CyclotomicExp and ExpGLV results must be the same in GT", prop.ForAll( + func(a GT, e fp.Element) bool { + a = FinalExponentiation(&a) + + var _e big.Int + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + e.ToBigIntRegular(&_e) + + var b, c, d GT + b.Exp(&a, _e) + c.ExpGLV(&a, &_e) + d.CyclotomicExp(&a, _e) + + return b.Equal(&c) && c.Equal(&d) + }, + genA, + genP, + )) + properties.Property("[BW6-756] Expt(Expt) and Exp(t^2) should output the same result in the cyclotomic subgroup", prop.ForAll( func(a GT) bool { var b, c, d GT @@ -304,3 +327,39 @@ func BenchmarkMultiPair(b *testing.B) { }) } } + +func BenchmarkExpGT(b *testing.B) { + + var a GT + a.SetRandom() + a = FinalExponentiation(&a) + + var e fp.Element + e.SetRandom() + + k := new(big.Int).SetUint64(12) + e.Exp(e, k) + var _e big.Int + e.ToBigIntRegular(&_e) + + b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.Exp(&a, _e) + } + }) + + b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.CyclotomicExp(&a, _e) + } + }) + + 
b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + a.ExpGLV(&a, &_e) + } + }) +} diff --git a/ecc/bw6-761/internal/fptower/e3.go b/ecc/bw6-761/internal/fptower/e3.go index 29990fe6bb..37eb4e2c67 100644 --- a/ecc/bw6-761/internal/fptower/e3.go +++ b/ecc/bw6-761/internal/fptower/e3.go @@ -16,7 +16,7 @@ import ( "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" ) -// E3 is a degree-three finite field extension of fp2 +// E3 is a degree-three finite field extension of fp3 type E3 struct { A0, A1, A2 fp.Element } @@ -27,7 +27,7 @@ func (z *E3) Equal(x *E3) bool { return z.A0.Equal(&x.A0) && z.A1.Equal(&x.A1) && z.A2.Equal(&x.A2) } -// SetString sets a E3 elmt from stringf +// SetString sets a E3 elmt from string func (z *E3) SetString(s1, s2, s3 string) *E3 { z.A0.SetString(s1) z.A1.SetString(s2) diff --git a/ecc/bw6-761/internal/fptower/parameters.go b/ecc/bw6-761/internal/fptower/parameters.go index 2ec4ef2b19..8990cd62ea 100644 --- a/ecc/bw6-761/internal/fptower/parameters.go +++ b/ecc/bw6-761/internal/fptower/parameters.go @@ -21,7 +21,7 @@ import ( "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" ) -// generator of the curve +// t-1 var xGen big.Int var glvBasis ecc.Lattice From 61d93f7804ffc6c9825050bd6fdb61d5f98ec6ea Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Wed, 23 Mar 2022 15:14:59 +0100 Subject: [PATCH 06/16] style: unnecessary use of fmt.Sprintf --- ecc/bls12-377/pairing_test.go | 6 +++--- ecc/bls12-378/pairing_test.go | 6 +++--- ecc/bls12-381/pairing_test.go | 6 +++--- ecc/bls24-315/pairing_test.go | 6 +++--- ecc/bn254/pairing_test.go | 6 +++--- ecc/bw6-633/pairing_test.go | 6 +++--- ecc/bw6-756/pairing_test.go | 6 +++--- ecc/bw6-761/pairing_test.go | 6 +++--- internal/generator/pairing/template/tests/pairing.go.tmpl | 6 +++--- 9 files changed, 27 insertions(+), 27 deletions(-) diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go index 61af6eec91..84df59ef04 100644 --- a/ecc/bls12-377/pairing_test.go +++ b/ecc/bls12-377/pairing_test.go @@ -341,21 +341,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int e.ToBigIntRegular(&_e) - b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.Exp(&a, _e) } }) - b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.CyclotomicExp(&a, _e) } }) - b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.ExpGLV(&a, &_e) diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go index 89c3bafc3e..4567591fd7 100644 --- a/ecc/bls12-378/pairing_test.go +++ b/ecc/bls12-378/pairing_test.go @@ -341,21 +341,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int e.ToBigIntRegular(&_e) - b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.Exp(&a, _e) } }) - b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.CyclotomicExp(&a, _e) } }) - b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.ExpGLV(&a, &_e) 
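For context on the middle benchmark: the 2-NAF recoding used by CyclotomicExp rewrites the exponent with signed digits in {-1, 0, 1} such that no two consecutive digits are non-zero, so on average only one digit in three costs a multiplication. The following is a minimal sketch of that recoding for illustration only; `naf` is a hypothetical helper, assuming only math/big, and stands in for (does not reproduce) gnark-crypto's ecc.NafDecomposition used in the code above:

	// naf returns the non-adjacent form of e > 0, least significant digit first.
	// Digits are in {-1, 0, 1} and no two consecutive digits are non-zero.
	func naf(e *big.Int) []int8 {
		n := new(big.Int).Set(e)
		four := big.NewInt(4)
		m := new(big.Int)
		var digits []int8
		for n.Sign() > 0 {
			var d int8
			if n.Bit(0) == 1 {
				// pick d = ±1 so that n-d ≡ 0 (mod 4): d = 1 if n ≡ 1, d = -1 if n ≡ 3 (mod 4)
				d = int8(2 - m.Mod(n, four).Int64())
				n.Sub(n, big.NewInt(int64(d)))
			}
			digits = append(digits, d)
			n.Rsh(n, 1)
		}
		return digits
	}

A -1 digit costs a multiplication by the precomputed inverse, which in the cyclotomic subgroup is just a conjugation (InverseUnitary); that is why CyclotomicExp can afford signed digits where a generic Exp cannot.
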
diff --git a/ecc/bls12-381/pairing_test.go b/ecc/bls12-381/pairing_test.go index c45923175d..ddd4eb0099 100644 --- a/ecc/bls12-381/pairing_test.go +++ b/ecc/bls12-381/pairing_test.go @@ -341,21 +341,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int e.ToBigIntRegular(&_e) - b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.Exp(&a, _e) } }) - b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.CyclotomicExp(&a, _e) } }) - b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.ExpGLV(&a, &_e) diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go index 7f2b9e04df..df0fa0fa87 100644 --- a/ecc/bls24-315/pairing_test.go +++ b/ecc/bls24-315/pairing_test.go @@ -344,21 +344,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int e.ToBigIntRegular(&_e) - b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.Exp(&a, _e) } }) - b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.CyclotomicExp(&a, _e) } }) - b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.ExpGLV(&a, &_e) diff --git a/ecc/bn254/pairing_test.go b/ecc/bn254/pairing_test.go index a875603cde..01f6db52b9 100644 --- a/ecc/bn254/pairing_test.go +++ b/ecc/bn254/pairing_test.go @@ -341,21 +341,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int e.ToBigIntRegular(&_e) - b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.Exp(&a, _e) } }) - b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.CyclotomicExp(&a, _e) } }) - b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.ExpGLV(&a, &_e) diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go index fef2c615a4..707ecf0272 100644 --- a/ecc/bw6-633/pairing_test.go +++ b/ecc/bw6-633/pairing_test.go @@ -344,21 +344,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int e.ToBigIntRegular(&_e) - b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) { + b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.Exp(&a, _e) } }) - b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) { + b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.CyclotomicExp(&a, _e) } }) - b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) { + b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { a.ExpGLV(&a, &_e) diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go index 21c46affe0..c586872320 100644 --- a/ecc/bw6-756/pairing_test.go +++ b/ecc/bw6-756/pairing_test.go @@ -342,21 +342,21 @@ func BenchmarkExpGT(b *testing.B) { var _e big.Int 
e.ToBigIntRegular(&_e)
 
-	b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) {
+	b.Run("Naive windowed Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.Exp(&a, _e)
 		}
 	})
 
-	b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) {
+	b.Run("2-NAF cyclotomic Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.CyclotomicExp(&a, _e)
 		}
 	})
 
-	b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) {
+	b.Run("windowed 2-dim GLV Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.ExpGLV(&a, &_e)
diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go
index 14dc68d169..d86c3e9729 100644
--- a/ecc/bw6-761/pairing_test.go
+++ b/ecc/bw6-761/pairing_test.go
@@ -344,21 +344,21 @@ func BenchmarkExpGT(b *testing.B) {
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
 
-	b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) {
+	b.Run("Naive windowed Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.Exp(&a, _e)
 		}
 	})
 
-	b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) {
+	b.Run("2-NAF cyclotomic Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.CyclotomicExp(&a, _e)
 		}
 	})
 
-	b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) {
+	b.Run("windowed 2-dim GLV Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.ExpGLV(&a, &_e)
diff --git a/internal/generator/pairing/template/tests/pairing.go.tmpl b/internal/generator/pairing/template/tests/pairing.go.tmpl
index bf655a4f3f..fe0fe1519e 100644
--- a/internal/generator/pairing/template/tests/pairing.go.tmpl
+++ b/internal/generator/pairing/template/tests/pairing.go.tmpl
@@ -344,21 +344,21 @@ func BenchmarkExpGT(b *testing.B) {
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
 
-	b.Run(fmt.Sprintf("Naive windowed Exp"), func(b *testing.B) {
+	b.Run("Naive windowed Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.Exp(&a, _e)
 		}
 	})
 
-	b.Run(fmt.Sprintf("2-NAF cyclotomic Exp"), func(b *testing.B) {
+	b.Run("2-NAF cyclotomic Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.CyclotomicExp(&a, _e)
 		}
 	})
 
-	b.Run(fmt.Sprintf("windowed 2-dim GLV Exp"), func(b *testing.B) {
+	b.Run("windowed 2-dim GLV Exp", func(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			a.ExpGLV(&a, &_e)

From 8022ee1e012b2bf123f590cb5910e55ff5e0d862 Mon Sep 17 00:00:00 2001
From: Youssef El Housni
Date: Wed, 23 Mar 2022 16:02:55 +0100
Subject: [PATCH 07/16] test: MSM5 and MSM16 for GT

---
 ecc/bls12-377/pairing_test.go | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go
index 8068bd8a35..ea32df9e11 100644
--- a/ecc/bls12-377/pairing_test.go
+++ b/ecc/bls12-377/pairing_test.go
@@ -298,6 +298,40 @@ func TestMultiExpGT(t *testing.T) {
 		genScalar,
 	))
 
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of square", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected, g GT
+			g.SetRandom()
+			// put into GT
+			g = FinalExponentiation(&_g)
+
+			// compute expected result with double and add
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(&_g, &finalScalar)
+
+			// mixer ensures that all the words of a fpElement are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+ Mul(&sampleScalars[i-1], &mixer). + FromMont() + } + + scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU()) + scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) + + var r5, r16 GT + r5.MsmC5(samplePoints[:], scalars5, false) + r16.MsmC16(samplePoints[:], scalars16, true) + return (r5.Equal(&expected) && r16.Equal(&expected)) + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[GT] Multi exponentation (<50points) should be consistant with sum of square", prop.ForAll( From c657828f9a91c1d35348e3205a9f4b57e90896c0 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Wed, 23 Mar 2022 16:07:31 +0100 Subject: [PATCH 08/16] feat: GT-MSM for BLS12-378 --- ecc/bls12-378/internal/fptower/multiexp.go | 1229 ++++++++++++++++++++ ecc/bls12-378/pairing_test.go | 176 +++ 2 files changed, 1405 insertions(+) create mode 100644 ecc/bls12-378/internal/fptower/multiexp.go diff --git a/ecc/bls12-378/internal/fptower/multiexp.go b/ecc/bls12-378/internal/fptower/multiexp.go new file mode 100644 index 0000000000..2c1feb87c5 --- /dev/null +++ b/ecc/bls12-378/internal/fptower/multiexp.go @@ -0,0 +1,1229 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fptower + +import ( + "errors" + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" + "github.com/consensys/gnark-crypto/internal/parallel" + "math" + "runtime" +) + +/* Multi-Exponentiation à la Pippenger */ + +// selector stores the index, mask and shifts needed to select bits from a scalar +// it is used during the multiExp algorithm or the batch scalar multiplication +type selector struct { + index uint64 // index in the multi-word scalar to select bits from + mask uint64 // mask (c-bit wide) + shift uint64 // shift needed to get our bits on low positions + + multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) + maskHigh uint64 // same than mask, for index+1 + shiftHigh uint64 // same than shift, for index+1 +} + +// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits +// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract +// 2^{c} to the current digit, making it negative. 
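+// (for example, with c = 4 a window holding the value 13 >= 2^{c-1} = 8 is rewritten as the
+// digit 13 - 16 = -3 together with a carry of 1 into the next window, since 13 = -3 + 16)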
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
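+				// (the borrow keeps every digit in the range [-2^{c-1}, 2^{c-1}-1], so it still
+				// fits in c bits with the msb acting as a sign flag)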
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E12) MultiExp(points []E12, scalars []fr.Element, config ecc.MultiExpConfig) (*E12, error) {
+	// note:
+	// each of the MsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows us to declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
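+		// (for instance, with nbPoints = 1<<20 the model above gives, for c = 16, about
+		// 256/16 * (2^20 + 2^16) ≈ 17.8M group operations)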
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerE12 , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]E12, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerE12(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerE12(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.Mul(p, &_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerE12(p *E12, c int, points []E12, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.MsmC4(points, scalars, splitFirstChunk) + + case 5: + p.MsmC5(points, scalars, splitFirstChunk) + + case 6: + p.MsmC6(points, scalars, splitFirstChunk) + + case 7: + p.MsmC7(points, scalars, splitFirstChunk) + + case 8: + p.MsmC8(points, scalars, splitFirstChunk) + + case 9: + p.MsmC9(points, scalars, splitFirstChunk) + + case 10: + p.MsmC10(points, scalars, splitFirstChunk) + + case 11: + p.MsmC11(points, scalars, splitFirstChunk) + + case 12: + p.MsmC12(points, scalars, splitFirstChunk) + + case 13: + p.MsmC13(points, scalars, splitFirstChunk) + + case 14: + p.MsmC14(points, scalars, splitFirstChunk) + + case 15: + p.MsmC15(points, scalars, splitFirstChunk) + + case 16: + p.MsmC16(points, scalars, splitFirstChunk) + + case 20: + p.MsmC20(points, scalars, splitFirstChunk) + + case 21: + p.MsmC21(points, scalars, splitFirstChunk) + + case 22: + p.MsmC22(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkE12 reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkE12(p *E12, c int, chChunks []chan E12) *E12 { + var _p E12 + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.CyclotomicSquare(&_p) + } + totalj := <-chChunks[j] + _p.Mul(&_p, &totalj) + } + + p.Set(&_p) + return p +} + +func msmProcessChunkE12(chunk uint64, + chRes chan<- E12, + buckets []E12, + c uint64, + points []E12, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].SetOne() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + 
s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + var tmp E12 + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].Mul(&buckets[bits-1], &points[i]) + } else { + // sub + tmp.Conjugate(&points[i]) + buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var one, runningSum, total E12 + runningSum.SetOne() + total.SetOne() + one.SetOne() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].Equal(&one) { + runningSum.Mul(&runningSum, &buckets[k]) + } + total.Mul(&total, &runningSum) + } + + chRes <- total + +} + +func (p *E12) MsmC4(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 4 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC5(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 5 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j 
int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC6(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 6 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC7(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 7 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + 
msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC8(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC9(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 9 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() 
{ + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC10(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC11(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + 
return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC12(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC13(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC14(points []E12, scalars []fr.Element, 
splitFirstChunk bool) *E12 { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC15(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC16(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = 
(fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC20(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC21(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + 
chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC22(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 22 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go index 4567591fd7..1517f195cc 100644 --- a/ecc/bls12-378/pairing_test.go +++ b/ecc/bls12-378/pairing_test.go @@ -19,8 +19,11 @@ package bls12378 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" "github.com/leanovate/gopter" @@ -231,6 +234,145 @@ func TestPairing(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := 
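// msmProcessChunkE12 itself is not part of this hunk; for orientation, here is
// a minimal sketch of the standard bucket accumulation it presumably performs,
// written multiplicatively since GT is a multiplicative group. digitAt is a
// hypothetical helper standing in for the digit extraction prepared by
// partitionScalars (the signed-digit recoding that lets 2^(c-1) buckets
// suffice is glossed over here):
func processChunkSketch(chunk uint64, chRes chan<- E12, buckets []E12, c uint64,
	points []E12, scalars []fr.Element) {
	for i := range buckets {
		buckets[i].SetOne() // identity element of GT
	}
	// drop each point into the bucket selected by its c-bit digit
	for i := range scalars {
		if digit := digitAt(&scalars[i], chunk, c); digit != 0 { // digitAt: hypothetical
			buckets[digit-1].Mul(&buckets[digit-1], &points[i])
		}
	}
	// running-product trick: computes prod_k buckets[k]^(k+1) with only
	// ~2*len(buckets) multiplications
	var runningProd, total E12
	runningProd.SetOne()
	total.SetOne()
	for k := len(buckets) - 1; k >= 0; k-- {
		runningProd.Mul(&runningProd, &buckets[k])
		total.Mul(&total, &runningProd)
	}
	chRes <- total
}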
gopter.NewProperties(parameters)
+
+	genScalar := GenFr()
+
+	// size of the multi-exps
+	const nbSamples = 143
+
+	// multi-exp points
+	var samplePoints [nbSamples]GT
+	var _g, g GT
+	_g.SetRandom()
+
+	// put into GT
+	_g = FinalExponentiation(&_g)
+
+	g.Set(&_g)
+	for i := 1; i <= nbSamples; i++ {
+		samplePoints[i-1].Set(&g)
+		g.Mul(&g, &_g)
+	}
+
+	// final scalar to use in double and add method (without mixer factor)
+	// n(n+1)(2n+1)/6 (sum of the squares from 1 to n)
+	var scalar big.Int
+	scalar.SetInt64(nbSamples)
+	scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1))
+	scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1))
+	scalar.Div(&scalar, new(big.Int).SetInt64(6))
+
+	// ensure a multiexp that is split has the same result as a non-split one
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, split1, split2 GT
+
+			// mixer ensures that all the words of an fr.Element are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePoints[:], scalars16, true)
+
+			split1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			split2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&split1) && r16.Equal(&split2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			// compute expected result with double and add
+			var expected GT
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(&_g, &finalScalar)
+
+			// mixer ensures that all the words of an fr.Element are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect a different multiExp algorithm than the bucket method above
+	// for a small number of points
+	properties.Property("[GT] Multi exponentiation (<50 points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of an fr.Element are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer). 
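// Where the n(n+1)(2n+1)/6 scalar above comes from: samplePoints[i-1] = g^i
// and sampleScalars[i-1] = i*mixer, so the whole multi-exp collapses to
// g^(mixer * sum i^2) with sum_{i=1..n} i^2 = n(n+1)(2n+1)/6. A standalone
// check of that identity for n = nbSamples = 143 (needs "fmt" and "math/big"):
func sumOfSquaresCheck() {
	const n = 143
	bySum := new(big.Int)
	for i := int64(1); i <= n; i++ {
		bySum.Add(bySum, new(big.Int).Mul(big.NewInt(i), big.NewInt(i)))
	}
	byFormula := big.NewInt(n)
	byFormula.Mul(byFormula, big.NewInt(n+1))
	byFormula.Mul(byFormula, big.NewInt(2*n+1))
	byFormula.Div(byFormula, big.NewInt(6))
	fmt.Println(bySum.Cmp(byFormula) == 0) // true: both equal 984984
}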
+ FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(&_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} // ------------------------------------------------------------ // benches @@ -362,3 +504,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&_g) + } + + var testPoint GT + + for i := 5; i <= pow; i++ { + using := 1 << i + + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) + } +} From 6ceeb9083ff6c107fd36ddae86d8245042fef4e8 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Fri, 17 Jun 2022 17:36:47 +0100 Subject: [PATCH 09/16] fix: add GT-exp to bls24-317 --- ecc/bls24-317/internal/fptower/e24.go | 114 ++++++++++++++++++- ecc/bls24-317/internal/fptower/e24_test.go | 25 ++++ ecc/bls24-317/internal/fptower/parameters.go | 33 ++++++ 3 files changed, 166 insertions(+), 6 deletions(-) create mode 100644 ecc/bls24-317/internal/fptower/parameters.go diff --git a/ecc/bls24-317/internal/fptower/e24.go b/ecc/bls24-317/internal/fptower/e24.go index 9792420ca6..48384aa007 100644 --- a/ecc/bls24-317/internal/fptower/e24.go +++ b/ecc/bls24-317/internal/fptower/e24.go @@ -18,6 +18,8 @@ package fptower import ( "errors" + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" "math/big" ) @@ -405,25 +407,125 @@ func BatchInvertE24(a []E24) []E24 { } // Exp sets z=x**e and returns it +// uses 2-bits windowed method func (z *E24) Exp(x *E24, e big.Int) *E24 { + var res E24 + var ops [3]E24 + res.SetOne() + ops[0].Set(x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 + } + } + z.Set(&res) + + return z +} + +// CyclotomicExp sets z=x**e and returns it +// uses 2-NAF decomposition +// x must be in the cyclotomic subgroup +// TODO: use a windowed method +func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { + var res, xInv E24 + xInv.InverseUnitary(x) + res.SetOne() + eNAF := make([]int8, e.BitLen()+3) + n := ecc.NafDecomposition(&e, eNAF[:]) + for i := n - 1; i >= 0; i-- { + res.CyclotomicSquare(&res) + if eNAF[i] == 1 { + res.Mul(&res, 
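// The 2-bit windowed Exp above first fills ops[0..2] = x, x^2, x^3 and then
// consumes the exponent two bits per step: two squarings per window plus one
// multiplication by ops[c-1] whenever the window value c is non-zero. The same
// digit walk over a uint64 exponent, with a toy modulus small enough (below
// 2^32) that the products cannot overflow — a model, not the E24 routine:
func windowed2bit(x, e, m uint64) uint64 {
	x %= m
	ops := [3]uint64{x, x * x % m, x * x % m * x % m} // x, x^2, x^3
	res := uint64(1)
	for shift := 62; shift >= 0; shift -= 2 {
		res = res * res % m
		res = res * res % m // two squarings per 2-bit window
		if c := (e >> uint(shift)) & 3; c != 0 {
			res = res * ops[c-1] % m
		}
	}
	return res // x^e mod m
}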
x)
+		} else if eNAF[i] == -1 {
+			res.Mul(&res, &xInv)
+		}
+	}
+	z.Set(&res)
+	return z
+}
+
+// ExpGLV sets z=x**e and returns it
+// uses 2-dimensional GLV with a 2-bit windowed method
+// x must be in GT
+// TODO: use 2-NAF
+// TODO: use higher dimensional decomposition
+func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 {
+
+	var table [15]E24
+	var res E24
+	var k1, k2 fr.Element
+
+	res.SetOne()
+
+	// table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a
+	table[0].Set(a)
+	table[3].Frobenius(a)
+
+	// split the scalar, modifies +-a, Frob(a) accordingly
+	k := ecc.SplitScalar(s, &glvBasis)
+
+	if k[0].Sign() == -1 {
+		k[0].Neg(&k[0])
+		table[0].InverseUnitary(&table[0])
+	}
+	if k[1].Sign() == -1 {
+		k[1].Neg(&k[1])
+		table[3].InverseUnitary(&table[3])
+	}
+
+	// precompute table (2 bits sliding window)
+	// table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0
+	table[1].CyclotomicSquare(&table[0])
+	table[2].Mul(&table[1], &table[0])
+	table[4].Mul(&table[3], &table[0])
+	table[5].Mul(&table[3], &table[1])
+	table[6].Mul(&table[3], &table[2])
+	table[7].CyclotomicSquare(&table[3])
+	table[8].Mul(&table[7], &table[0])
+	table[9].Mul(&table[7], &table[1])
+	table[10].Mul(&table[7], &table[2])
+	table[11].Mul(&table[7], &table[3])
+	table[12].Mul(&table[11], &table[0])
+	table[13].Mul(&table[11], &table[1])
+	table[14].Mul(&table[11], &table[2])
+
+	// bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max
+	k1.SetBigInt(&k[0]).FromMont()
+	k2.SetBigInt(&k[1]).FromMont()
+
+	// loop starts from len(k1)/2 + 1 due to the bounds
+	for i := len(k1)/2 + 1; i >= 0; i-- {
+		mask := uint64(3) << 62
+		for j := 0; j < 32; j++ {
+			res.CyclotomicSquare(&res).CyclotomicSquare(&res)
+			b1 := (k1[i] & mask) >> (62 - 2*j)
+			b2 := (k2[i] & mask) >> (62 - 2*j)
+			if b1|b2 != 0 {
+				s := (b2<<2 | b1)
+				res.Mul(&res, &table[s-1])
+			}
+			mask = mask >> 2
+		}
+	}
+
+	p.Set(&res)
+	return p
+}
+
 // InverseUnitary inverts a unitary element
 func (z *E24) InverseUnitary(x *E24) *E24 {
 	return z.Conjugate(x)
diff --git a/ecc/bls24-317/internal/fptower/e24_test.go b/ecc/bls24-317/internal/fptower/e24_test.go
index b39bf90642..b5c2dec411 100644
--- a/ecc/bls24-317/internal/fptower/e24_test.go
+++ b/ecc/bls24-317/internal/fptower/e24_test.go
@@ -17,6 +17,7 @@ package fptower
 import (
+	"math/big"
 	"testing"
 	"github.com/consensys/gnark-crypto/ecc/bls24-317/fp"
@@ -192,6 +193,7 @@ func TestE24Ops(t *testing.T) {
 	genA := GenE24()
 	genB := GenE24()
+	genExp := GenFp()
 	properties.Property("[BLS24-317] sub & add should leave an element invariant", prop.ForAll(
 		func(a, b *E24) bool {
@@ -406,6 +408,29 @@ func TestE24Ops(t *testing.T) {
 		genA,
 	))
+	properties.Property("[BLS24-317] Exp and CyclotomicExp results must be the same in the cyclotomic subgroup", prop.ForAll(
+		func(a *E24, e fp.Element) bool {
+			var b, c, d E24
+			// put in the cyclo subgroup
+			b.Conjugate(a)
+			a.Inverse(a)
+			b.Mul(&b, a)
+			a.FrobeniusQuad(&b).Mul(a, &b)
+
+			var _e big.Int
+			k := new(big.Int).SetUint64(24)
+			e.Exp(e, k)
+			e.ToBigIntRegular(&_e)
+
+			c.Exp(a, _e)
+			d.CyclotomicExp(a, _e)
+
+			return c.Equal(&d)
+		},
+		genA,
+		genExp,
+	))
+
 	properties.Property("[BLS24-317] Frobenius of x in E24 should be equal to x^q", prop.ForAll(
 		func(a *E24) bool {
 			var b, c E24
diff --git a/ecc/bls24-317/internal/fptower/parameters.go b/ecc/bls24-317/internal/fptower/parameters.go
new file mode 100644
index 0000000000..6d637e8624
--- /dev/null
+++ b/ecc/bls24-317/internal/fptower/parameters.go
@@ -0,0 +1,33 @@
+// Copyright 2020 ConsenSys AG
+//
+// Licensed 
under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fptower + +import ( + "math/big" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" +) + +// generator of the curve +var xGen big.Int + +var glvBasis ecc.Lattice + +func init() { + xGen.SetString("3640754176", 10) + _r := fr.Modulus() + ecc.PrecomputeLattice(_r, &xGen, &glvBasis) +} From 2fbe149d279463d116423bdcb93b889cc3588874 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Fri, 17 Jun 2022 18:14:34 +0100 Subject: [PATCH 10/16] fix: handle negative exponent in Fp12 exp --- ecc/bls12-377/internal/fptower/e12.go | 106 +++++++++++++---- ecc/bls12-377/internal/fptower/e2.go | 21 +++- ecc/bls12-377/pairing_test.go | 4 +- ecc/bls12-378/internal/fptower/e12.go | 106 +++++++++++++---- ecc/bls12-378/internal/fptower/e2.go | 21 +++- ecc/bls12-378/pairing_test.go | 4 +- ecc/bls12-381/internal/fptower/e12.go | 106 +++++++++++++---- ecc/bls12-381/internal/fptower/e2.go | 21 +++- ecc/bls12-381/pairing_test.go | 4 +- ecc/bls24-315/pairing_test.go | 4 +- ecc/bls24-317/pairing_test.go | 4 +- ecc/bn254/internal/fptower/e12.go | 106 +++++++++++++---- ecc/bn254/internal/fptower/e2.go | 21 +++- ecc/bn254/pairing_test.go | 4 +- ecc/bw6-633/pairing_test.go | 4 +- ecc/bw6-756/pairing_test.go | 4 +- ecc/bw6-761/pairing_test.go | 4 +- .../pairing/template/tests/pairing.go.tmpl | 4 +- .../template/fq12over6over2/fq12.go.tmpl | 108 +++++++++++++----- .../tower/template/fq12over6over2/fq2.go.tmpl | 26 ++++- 20 files changed, 523 insertions(+), 159 deletions(-) diff --git a/ecc/bls12-377/internal/fptower/e12.go b/ecc/bls12-377/internal/fptower/e12.go index 1d9325084e..2769e65952 100644 --- a/ecc/bls12-377/internal/fptower/e12.go +++ b/ecc/bls12-377/internal/fptower/e12.go @@ -23,8 +23,15 @@ import ( "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" "math/big" + "sync" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E12 is a degree two finite field extension of fp6 type E12 struct { C0, C1 E6 @@ -408,9 +415,25 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, e big.Int) *E12 { +func (z *E12) Exp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E12 var ops [3]E12 @@ -438,11 +461,28 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, e 
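// ecc.NafDecomposition used by CyclotomicExp rewrites the exponent with digits
// in {-1, 0, 1} such that no two adjacent digits are non-zero, which is why a
// single precomputed xInv (a conjugation, nearly free for unitary elements)
// replaces a whole window table. For intuition, a small standalone NAF
// computation for positive exponents (not the library routine):
func naf(e int64) []int8 {
	var digits []int8 // least significant digit first
	for e != 0 {
		var d int8
		if e&1 == 1 {
			d = int8(2 - e&3) // +1 if e = 1 (mod 4), -1 if e = 3 (mod 4)
			e -= int64(d)
		}
		digits = append(digits, d)
		e >>= 1
	}
	return digits // e.g. naf(7) = [-1 0 0 1], i.e. 7 = 8 - 1
}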
big.Int) *E12 { +func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E12 xInv.InverseUnitary(x) res.SetOne() @@ -460,37 +500,53 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { +func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E12 var res E12 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -505,17 +561,17 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -524,8 +580,8 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bls12-377/internal/fptower/e2.go b/ecc/bls12-377/internal/fptower/e2.go index ee79f4c9d9..32e5ac4bf9 100644 --- a/ecc/bls12-377/internal/fptower/e2.go +++ b/ecc/bls12-377/internal/fptower/e2.go @@ -171,9 +171,26 @@ func (z *E2) Legendre() int { } // Exp sets z=x**e and 
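// The 15-entry table built by ExpGLV satisfies the invariant
// table[s-1] = x^(s&3) * Frobenius(x)^(s>>2) for 1 <= s <= 15, which is what
// the inner loop's lookup table[(b2<<2|b1)-1] relies on. A standalone model
// that tracks each entry as its exponent pair (i on x, j on Frobenius(x)) and
// replays the exact construction order used above:
type glvEntry struct{ i, j int }

func glvTableConsistent() bool {
	var t [15]glvEntry
	mul := func(a, b glvEntry) glvEntry { return glvEntry{a.i + b.i, a.j + b.j} }
	t[0] = glvEntry{1, 0}  // x
	t[3] = glvEntry{0, 1}  // Frobenius(x)
	t[1] = mul(t[0], t[0]) // CyclotomicSquare
	t[2] = mul(t[1], t[0])
	t[4] = mul(t[3], t[0])
	t[5] = mul(t[3], t[1])
	t[6] = mul(t[3], t[2])
	t[7] = mul(t[3], t[3]) // CyclotomicSquare
	t[8] = mul(t[7], t[0])
	t[9] = mul(t[7], t[1])
	t[10] = mul(t[7], t[2])
	t[11] = mul(t[7], t[3])
	t[12] = mul(t[11], t[0])
	t[13] = mul(t[11], t[1])
	t[14] = mul(t[11], t[2])
	for s := 1; s <= 15; s++ {
		if t[s-1].i != s&3 || t[s-1].j != s>>2 {
			return false
		}
	}
	return true // holds for all 15 entries
}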
returns it -func (z *E2) Exp(x E2, exponent *big.Int) *E2 { +func (z *E2) Exp(x E2, e *big.Int) *E2 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(k) + } + z.SetOne() - b := exponent.Bytes() + b := e.Bytes() for i := 0; i < len(b); i++ { w := b[i] for j := 0; j < 8; j++ { diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go index be852d0280..e678c58613 100644 --- a/ecc/bls12-377/pairing_test.go +++ b/ecc/bls12-377/pairing_test.go @@ -77,7 +77,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -407,7 +407,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bls12-378/internal/fptower/e12.go b/ecc/bls12-378/internal/fptower/e12.go index 14437de2b3..27f6285fdc 100644 --- a/ecc/bls12-378/internal/fptower/e12.go +++ b/ecc/bls12-378/internal/fptower/e12.go @@ -23,8 +23,15 @@ import ( "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" "math/big" + "sync" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E12 is a degree two finite field extension of fp6 type E12 struct { C0, C1 E6 @@ -408,9 +415,25 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, e big.Int) *E12 { +func (z *E12) Exp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E12 var ops [3]E12 @@ -438,11 +461,28 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { +func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E12 xInv.InverseUnitary(x) res.SetOne() @@ -460,37 +500,53 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { +func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if 
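// The guard added across these files uses the identity x^k = (x^-1)^(-k),
// valid for any invertible x. Two details worth flagging in the versions
// above, as far as I can tell from the diff alone: in the E12 variant,
// bigIntPool.Get returns a *big.Int but `e = *bigIntPool.Get().(*big.Int)`
// copies the pooled value, so `defer bigIntPool.Put(e)` stores a big.Int
// value back in the pool and a later Get().(*big.Int) on that entry would
// fail its type assertion (go vet also flags the non-pointer Put). In the E2
// variant, `k.Neg(k)` negates the freshly pooled (stale) value rather than e,
// and k is never read again; the loop walks e.Bytes(), and big.Int.Bytes is
// defined on the absolute value, so the result still comes out as (x^-1)^|e|.
// A consistency check one could add to the fptower tests (negExpIdentity is a
// hypothetical helper, not part of this patch):
func negExpIdentity(a *E2, e *big.Int) bool {
	var lhs, rhs E2
	lhs.Exp(*a, new(big.Int).Neg(e)) // x^(-e), exercising the negative branch
	rhs.Inverse(a)
	rhs.Exp(rhs, e) // (x^-1)^e
	return lhs.Equal(&rhs)
}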
k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E12 var res E12 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -505,17 +561,17 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -524,8 +580,8 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bls12-378/internal/fptower/e2.go b/ecc/bls12-378/internal/fptower/e2.go index 4ca5593160..0223d0aac8 100644 --- a/ecc/bls12-378/internal/fptower/e2.go +++ b/ecc/bls12-378/internal/fptower/e2.go @@ -171,9 +171,26 @@ func (z *E2) Legendre() int { } // Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, exponent *big.Int) *E2 { +func (z *E2) Exp(x E2, e *big.Int) *E2 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(k) + } + z.SetOne() - b := exponent.Bytes() + b := e.Bytes() for i := 0; i < len(b); i++ { w := b[i] for j := 0; j < 8; j++ { diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go index 4323a4dfac..c0a4e66eae 100644 --- a/ecc/bls12-378/pairing_test.go +++ b/ecc/bls12-378/pairing_test.go @@ -77,7 +77,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -407,7 +407,7 @@ func 
BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bls12-381/internal/fptower/e12.go b/ecc/bls12-381/internal/fptower/e12.go index 2ef60dbd25..7ba4aa36ec 100644 --- a/ecc/bls12-381/internal/fptower/e12.go +++ b/ecc/bls12-381/internal/fptower/e12.go @@ -23,8 +23,15 @@ import ( "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" "math/big" + "sync" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E12 is a degree two finite field extension of fp6 type E12 struct { C0, C1 E6 @@ -408,9 +415,25 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, e big.Int) *E12 { +func (z *E12) Exp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E12 var ops [3]E12 @@ -438,11 +461,28 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { +func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E12 xInv.InverseUnitary(x) res.SetOne() @@ -460,37 +500,53 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { +func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E12 var res E12 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding 
window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -505,17 +561,17 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -524,8 +580,8 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bls12-381/internal/fptower/e2.go b/ecc/bls12-381/internal/fptower/e2.go index 20d6479a4b..a553e7fba5 100644 --- a/ecc/bls12-381/internal/fptower/e2.go +++ b/ecc/bls12-381/internal/fptower/e2.go @@ -171,9 +171,26 @@ func (z *E2) Legendre() int { } // Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, exponent *big.Int) *E2 { +func (z *E2) Exp(x E2, e *big.Int) *E2 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(k) + } + z.SetOne() - b := exponent.Bytes() + b := e.Bytes() for i := 0; i < len(b); i++ { w := b[i] for j := 0; j < 8; j++ { diff --git a/ecc/bls12-381/pairing_test.go b/ecc/bls12-381/pairing_test.go index 003f841d31..4e7d4e4d7c 100644 --- a/ecc/bls12-381/pairing_test.go +++ b/ecc/bls12-381/pairing_test.go @@ -77,7 +77,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -407,7 +407,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go index 5100bca1b9..bc1d1383dc 100644 --- a/ecc/bls24-315/pairing_test.go +++ b/ecc/bls24-315/pairing_test.go @@ -79,7 +79,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -410,7 +410,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bls24-317/pairing_test.go b/ecc/bls24-317/pairing_test.go index b0025e26af..b4a4595f81 100644 --- a/ecc/bls24-317/pairing_test.go +++ b/ecc/bls24-317/pairing_test.go @@ -78,7 +78,7 @@ func TestPairing(t 
*testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -408,7 +408,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bn254/internal/fptower/e12.go b/ecc/bn254/internal/fptower/e12.go index 59ca53395f..43f09f1f80 100644 --- a/ecc/bn254/internal/fptower/e12.go +++ b/ecc/bn254/internal/fptower/e12.go @@ -23,8 +23,15 @@ import ( "github.com/consensys/gnark-crypto/ecc/bn254/fp" "github.com/consensys/gnark-crypto/ecc/bn254/fr" "math/big" + "sync" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E12 is a degree two finite field extension of fp6 type E12 struct { C0, C1 E6 @@ -408,9 +415,25 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, e big.Int) *E12 { +func (z *E12) Exp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E12 var ops [3]E12 @@ -438,11 +461,28 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { +func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E12 xInv.InverseUnitary(x) res.SetOne() @@ -460,37 +500,53 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { +func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E12 var res E12 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == 
-1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -505,17 +561,17 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -524,8 +580,8 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bn254/internal/fptower/e2.go b/ecc/bn254/internal/fptower/e2.go index fe7e11343c..f6bec9b381 100644 --- a/ecc/bn254/internal/fptower/e2.go +++ b/ecc/bn254/internal/fptower/e2.go @@ -171,9 +171,26 @@ func (z *E2) Legendre() int { } // Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, exponent *big.Int) *E2 { +func (z *E2) Exp(x E2, e *big.Int) *E2 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(k) + } + z.SetOne() - b := exponent.Bytes() + b := e.Bytes() for i := 0; i < len(b); i++ { w := b[i] for j := 0; j < 8; j++ { diff --git a/ecc/bn254/pairing_test.go b/ecc/bn254/pairing_test.go index 4fb2a98dc2..e342770160 100644 --- a/ecc/bn254/pairing_test.go +++ b/ecc/bn254/pairing_test.go @@ -77,7 +77,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -407,7 +407,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go index be5c03b17f..a8ed3ab2dc 100644 --- a/ecc/bw6-633/pairing_test.go +++ b/ecc/bw6-633/pairing_test.go @@ -79,7 +79,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -410,7 +410,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go index 7b7aaef22b..466a7c797d 100644 --- 
a/ecc/bw6-756/pairing_test.go +++ b/ecc/bw6-756/pairing_test.go @@ -78,7 +78,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -408,7 +408,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go index 5286484779..323deef40f 100644 --- a/ecc/bw6-761/pairing_test.go +++ b/ecc/bw6-761/pairing_test.go @@ -79,7 +79,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -410,7 +410,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/internal/generator/pairing/template/tests/pairing.go.tmpl b/internal/generator/pairing/template/tests/pairing.go.tmpl index c6bcbe6163..d77d7e45c8 100644 --- a/internal/generator/pairing/template/tests/pairing.go.tmpl +++ b/internal/generator/pairing/template/tests/pairing.go.tmpl @@ -70,7 +70,7 @@ func TestPairing(t *testing.T) { var b, c, d GT b.Exp(&a, _e) - c.ExpGLV(&a, &_e) + c.ExpGLV(&a, _e) d.CyclotomicExp(&a, _e) return b.Equal(&c) && c.Equal(&d) @@ -415,7 +415,7 @@ func BenchmarkExpGT(b *testing.B) { b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, &_e) + a.ExpGLV(&a, _e) } }) } diff --git a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl index a0908372e3..5975f193b3 100644 --- a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl @@ -2,11 +2,18 @@ import ( "math/big" "encoding/binary" "errors" + "sync" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/{{.Curve.Name}}/fp" "github.com/consensys/gnark-crypto/ecc/{{.Curve.Name}}/fr" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E12 is a degree two finite field extension of fp6 type E12 struct { C0, C1 E6 @@ -391,9 +398,25 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, e big.Int) *E12 { +func (z *E12) Exp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E12 var ops [3]E12 @@ -421,11 +444,28 @@ func (z *E12) Exp(x *E12, e big.Int) *E12 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { +func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + 
x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E12 xInv.InverseUnitary(x) res.SetOne() @@ -443,37 +483,53 @@ func (z *E12) CyclotomicExp(x *E12, e big.Int) *E12 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { +func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } - var table [15]E12 + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + + var table [15]E12 var res E12 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -488,17 +544,17 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -507,8 +563,8 @@ func (p *E12) ExpGLV(a *E12, s *big.Int) *E12 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl index 9640e1ab77..35ed62c3e7 100644 --- a/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl @@ -4,7 +4,6 @@ import ( "github.com/consensys/gnark-crypto/ecc/{{.Curve.Name}}/fp" ) - // E2 is a degree two finite field extension of fp.Element type E2 struct { A0, A1 fp.Element @@ -143,7 +142,7 @@ 
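// The .tmpl edits here are what fan the fq2/fq12 changes out to every curve:
// the code generator executes these templates once per curve, substituting
// {{.Curve.Name}}. A toy text/template run showing just that mechanism (the
// real generator in internal/generator drives many more fields; needs "os"
// and "text/template"):
func renderImportLine() {
	const line = `import "github.com/consensys/gnark-crypto/ecc/{{.Curve.Name}}/fp"` + "\n"
	t := template.Must(template.New("fq2").Parse(line))
	for _, name := range []string{"bls12-377", "bls12-381", "bn254"} {
		var data struct{ Curve struct{ Name string } }
		data.Curve.Name = name
		_ = t.Execute(os.Stdout, data)
	}
}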
func (z *E2) Conjugate(x *E2) *E2 {
 	return z
 }
-// Halve sets z = z / 2 
+// Halve sets z = z / 2
 func (z *E2) Halve() {
 	z.A0.Halve()
 	z.A1.Halve()
@@ -157,9 +156,26 @@ func (z *E2) Legendre() int {
 }
 // Exp sets z=x**e and returns it
-func (z *E2) Exp(x E2, exponent *big.Int) *E2 {
+func (z *E2) Exp(x E2, e *big.Int) *E2 {
+	if e.IsUint64() && e.Uint64() == 0 {
+		return z.SetOne()
+	}
+
+	k := e
+	if k.Sign() == -1 {
+		// negative k, we invert
+		// if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²)
+		x.Inverse(&x)
+
+		// we negate k in a temp big.Int since
+		// Int.Bit(_) of k and -k is different
+		k = bigIntPool.Get().(*big.Int)
+		defer bigIntPool.Put(k)
+		k.Neg(k)
+	}
+
 	z.SetOne()
-	b := exponent.Bytes()
+	b := e.Bytes()
 	for i := 0; i < len(b); i++ {
 		w := b[i]
 		for j := 0; j < 8; j++ {

From <commit sha> Mon Sep 17 00:00:00 2001
From: Youssef El Housni
Date: Sat, 18 Jun 2022 16:28:55 +0100
Subject: [PATCH 11/16] fix: handle negative exponent in Fp24 and Fp6 exp
---
 ecc/bls24-315/internal/fptower/e2.go | 21 ++++-
 ecc/bls24-315/internal/fptower/e24.go | 106 ++++++++++++++++++++------
 ecc/bls24-315/internal/fptower/e4.go | 17 +++++
 ecc/bls24-317/internal/fptower/e2.go | 21 ++++-
 ecc/bls24-317/internal/fptower/e24.go | 106 ++++++++++++++++++++------
 ecc/bls24-317/internal/fptower/e4.go | 17 +++++
 ecc/bw6-633/internal/fptower/e6.go | 106 ++++++++++++++++++++------
 ecc/bw6-756/internal/fptower/e6.go | 106 ++++++++++++++++++++------
 ecc/bw6-761/internal/fptower/e6.go | 106 ++++++++++++++++++++------
 9 files changed, 477 insertions(+), 129 deletions(-)
diff --git a/ecc/bls24-315/internal/fptower/e2.go b/ecc/bls24-315/internal/fptower/e2.go
index e026fee0e0..774c4c1d48 100644
--- a/ecc/bls24-315/internal/fptower/e2.go
+++ b/ecc/bls24-315/internal/fptower/e2.go
@@ -164,9 +164,26 @@ func (z *E2) Legendre() int {
 }
 // Exp sets z=x**e and returns it
-func (z *E2) Exp(x E2, exponent *big.Int) *E2 {
+func (z *E2) Exp(x E2, e *big.Int) *E2 {
+	if e.IsUint64() && e.Uint64() == 0 {
+		return z.SetOne()
+	}
+
+	k := e
+	if k.Sign() == -1 {
+		// negative k, we invert
+		// if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²)
+		x.Inverse(&x)
+
+		// we negate k in a temp big.Int since
+		// Int.Bit(_) of k and -k is different
+		k = bigIntPool.Get().(*big.Int)
+		defer bigIntPool.Put(k)
+		k.Neg(k)
+	}
+
 	z.SetOne()
-	b := exponent.Bytes()
+	b := e.Bytes()
 	for i := 0; i < len(b); i++ {
 		w := b[i]
 		for j := 0; j < 8; j++ {
diff --git a/ecc/bls24-315/internal/fptower/e24.go b/ecc/bls24-315/internal/fptower/e24.go
index bedfba214c..a935af6e11 100644
--- a/ecc/bls24-315/internal/fptower/e24.go
+++ b/ecc/bls24-315/internal/fptower/e24.go
@@ -21,8 +21,15 @@ import (
 	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/bls24-315/fr"
 	"math/big"
+	"sync"
 )
+var bigIntPool = sync.Pool{
+	New: func() interface{} {
+		return new(big.Int)
+	},
+}
+
 // E24 is a degree two finite field extension of fp6
 type E24 struct {
 	D0, D1 E12
@@ -406,9 +413,25 @@ func BatchInvertE24(a []E24) []E24 {
 	return res
 }
-// Exp sets z=x**e and returns it
+// Exp sets z=x**k and returns it
 // uses a 2-bit windowed method
-func (z *E24) Exp(x *E24, e big.Int) *E24 {
+func (z *E24) Exp(x *E24, k big.Int) *E24 {
+	if k.IsUint64() && k.Uint64() == 0 {
+		return z.SetOne()
+	}
+
+	e := k
+	if k.Sign() == -1 {
+		// negative k, we invert
+		// if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q)
+		x.Inverse(x)
+
+		// we negate k in a temp big.Int since
+		// Int.Bit(_) of k and -k is different
+		e = *bigIntPool.Get().(*big.Int)
+		defer bigIntPool.Put(e)
+		e.Neg(&k)
+	}
 	var res E24
 	var ops [3]E24
@@ -436,11 +459,28 @@ func (z *E24) Exp(x *E24, e big.Int) *E24 {
 	return z
 }
-// CyclotomicExp 
sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { +func (z *E24) CyclotomicExp(x *E24, k big.Int) *E24 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E24 xInv.InverseUnitary(x) res.SetOne() @@ -458,37 +498,53 @@ func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { +func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E24 var res E24 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -503,17 +559,17 @@ func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1)/2 + 1; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1)/2 + 1; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -522,8 +578,8 @@ func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bls24-315/internal/fptower/e4.go 
b/ecc/bls24-315/internal/fptower/e4.go index 830f988691..d73f787dd1 100644 --- a/ecc/bls24-315/internal/fptower/e4.go +++ b/ecc/bls24-315/internal/fptower/e4.go @@ -217,6 +217,23 @@ func (z *E4) Inverse(x *E4) *E4 { // Exp sets z=x**e and returns it func (z *E4) Exp(x *E4, e big.Int) *E4 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(&k) + } + var res E4 res.SetOne() b := e.Bytes() diff --git a/ecc/bls24-317/internal/fptower/e2.go b/ecc/bls24-317/internal/fptower/e2.go index 25d035ea80..f5d018924f 100644 --- a/ecc/bls24-317/internal/fptower/e2.go +++ b/ecc/bls24-317/internal/fptower/e2.go @@ -163,9 +163,26 @@ func (z *E2) Legendre() int { } // Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, exponent *big.Int) *E2 { +func (z *E2) Exp(x E2, e *big.Int) *E2 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(k) + } + z.SetOne() - b := exponent.Bytes() + b := e.Bytes() for i := 0; i < len(b); i++ { w := b[i] for j := 0; j < 8; j++ { diff --git a/ecc/bls24-317/internal/fptower/e24.go b/ecc/bls24-317/internal/fptower/e24.go index 48384aa007..f25d66539d 100644 --- a/ecc/bls24-317/internal/fptower/e24.go +++ b/ecc/bls24-317/internal/fptower/e24.go @@ -21,8 +21,15 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" "math/big" + "sync" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E24 is a degree two finite field extension of fp6 type E24 struct { D0, D1 E12 @@ -406,9 +413,25 @@ func BatchInvertE24(a []E24) []E24 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E24) Exp(x *E24, e big.Int) *E24 { +func (z *E24) Exp(x *E24, k big.Int) *E24 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E24 var ops [3]E24 @@ -436,11 +459,28 @@ func (z *E24) Exp(x *E24, e big.Int) *E24 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { +func (z *E24) CyclotomicExp(x *E24, k big.Int) *E24 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E24 xInv.InverseUnitary(x) res.SetOne() @@ -458,37 +498,53 @@ func (z *E24) CyclotomicExp(x *E24, e big.Int) *E24 { return z 
} -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { +func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E24 var res E24 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -503,17 +559,17 @@ func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1)/2 + 1; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1)/2 + 1; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -522,8 +578,8 @@ func (p *E24) ExpGLV(a *E24, s *big.Int) *E24 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bls24-317/internal/fptower/e4.go b/ecc/bls24-317/internal/fptower/e4.go index 2c84e6d1c3..bed1311c0d 100644 --- a/ecc/bls24-317/internal/fptower/e4.go +++ b/ecc/bls24-317/internal/fptower/e4.go @@ -218,6 +218,23 @@ func (z *E4) Inverse(x *E4) *E4 { // Exp sets z=x**e and returns it func (z *E4) Exp(x *E4, e big.Int) *E4 { + if e.IsUint64() && e.Uint64() == 0 { + return z.SetOne() + } + + k := e + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + k = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(k) + k.Neg(&k) + } + var res E4 res.SetOne() b := e.Bytes() diff --git a/ecc/bw6-633/internal/fptower/e6.go b/ecc/bw6-633/internal/fptower/e6.go index 71a59cf64d..fdc1afa39c 
100644 --- a/ecc/bw6-633/internal/fptower/e6.go +++ b/ecc/bw6-633/internal/fptower/e6.go @@ -19,12 +19,19 @@ package fptower import ( "errors" "math/big" + "sync" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E6 is a degree two finite field extension of fp3 type E6 struct { B0, B1 E3 @@ -351,9 +358,25 @@ func BatchInvertE6(a []E6) []E6 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E6) Exp(x *E6, e big.Int) *E6 { +func (z *E6) Exp(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E6 var ops [3]E6 @@ -381,11 +404,28 @@ func (z *E6) Exp(x *E6, e big.Int) *E6 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { +func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E6 xInv.InverseUnitary(x) res.SetOne() @@ -403,37 +443,53 @@ func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { +func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E6 var res E6 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -448,17 +504,17 
@@ func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -467,8 +523,8 @@ func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bw6-756/internal/fptower/e6.go b/ecc/bw6-756/internal/fptower/e6.go index e83a3deabe..16464db219 100644 --- a/ecc/bw6-756/internal/fptower/e6.go +++ b/ecc/bw6-756/internal/fptower/e6.go @@ -19,12 +19,19 @@ package fptower import ( "errors" "math/big" + "sync" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E6 is a degree two finite field extension of fp3 type E6 struct { B0, B1 E3 @@ -350,9 +357,25 @@ func BatchInvertE6(a []E6) []E6 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E6) Exp(x *E6, e big.Int) *E6 { +func (z *E6) Exp(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E6 var ops [3]E6 @@ -380,11 +403,28 @@ func (z *E6) Exp(x *E6, e big.Int) *E6 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { +func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E6 xInv.InverseUnitary(x) res.SetOne() @@ -402,37 +442,53 @@ func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { +func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // 
negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E6 var res E6 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -447,17 +503,17 @@ func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -466,8 +522,8 @@ func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element diff --git a/ecc/bw6-761/internal/fptower/e6.go b/ecc/bw6-761/internal/fptower/e6.go index 87cee347f2..0f914d18d0 100644 --- a/ecc/bw6-761/internal/fptower/e6.go +++ b/ecc/bw6-761/internal/fptower/e6.go @@ -19,12 +19,19 @@ package fptower import ( "errors" "math/big" + "sync" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" ) +var bigIntPool = sync.Pool{ + New: func() interface{} { + return new(big.Int) + }, +} + // E6 is a degree two finite field extension of fp3 type E6 struct { B0, B1 E3 @@ -350,9 +357,25 @@ func BatchInvertE6(a []E6) []E6 { return res } -// Exp sets z=x**e and returns it +// Exp sets z=x**k and returns it // uses 2-bits windowed method -func (z *E6) Exp(x *E6, e big.Int) *E6 { +func (z *E6) Exp(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var res E6 var ops [3]E6 @@ -380,11 +403,28 @@ func (z *E6) Exp(x *E6, e big.Int) 
*E6 { return z } -// CyclotomicExp sets z=x**e and returns it +// CyclotomicExp sets z=x**k and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { +func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } + var res, xInv E6 xInv.InverseUnitary(x) res.SetOne() @@ -402,37 +442,53 @@ func (z *E6) CyclotomicExp(x *E6, e big.Int) *E6 { return z } -// ExpGLV sets z=x**e and returns it +// ExpGLV sets z=x**k and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { +func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = *bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(&k) + } var table [15]E6 var res E6 - var k1, k2 fr.Element + var s1, s2 fr.Element res.SetOne() - // table[b3b2b1b0-1] = b3b2*Frobinius(a) + b1b0*a - table[0].Set(a) - table[3].Frobenius(a) + // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x + table[0].Set(x) + table[3].Frobenius(x) // split the scalar, modifies +-x, Frob(x) accordingly - k := ecc.SplitScalar(s, &glvBasis) + s := ecc.SplitScalar(&e, &glvBasis) - if k[0].Sign() == -1 { - k[0].Neg(&k[0]) + if s[0].Sign() == -1 { + s[0].Neg(&s[0]) table[0].InverseUnitary(&table[0]) } - if k[1].Sign() == -1 { - k[1].Neg(&k[1]) + if s[1].Sign() == -1 { + s[1].Neg(&s[1]) table[3].InverseUnitary(&table[3]) } // precompute table (2 bits sliding window) - // table[b3b2b1b0-1] = b3b2*Frobenius(a) + b1b0*a if b3b2b1b0 != 0 + // table[b3b2b1b0-1] = b3b2*Frobenius(x) + b1b0*x if b3b2b1b0 != 0 table[1].CyclotomicSquare(&table[0]) table[2].Mul(&table[1], &table[0]) table[4].Mul(&table[3], &table[0]) @@ -447,17 +503,17 @@ func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { table[13].Mul(&table[11], &table[1]) table[14].Mul(&table[11], &table[2]) - // bounds on the lattice base vectors guarantee that k1, k2 are len(r)/2 bits long max - k1.SetBigInt(&k[0]).FromMont() - k2.SetBigInt(&k[1]).FromMont() + // bounds on the lattice base vectors guarantee that s1, s2 are len(r)/2 bits long max + s1.SetBigInt(&s[0]).FromMont() + s2.SetBigInt(&s[1]).FromMont() - // loop starts from len(k1)/2 due to the bounds - for i := len(k1) / 2; i >= 0; i-- { + // loop starts from len(s1)/2 due to the bounds + for i := len(s1) / 2; i >= 0; i-- { mask := uint64(3) << 62 for j := 0; j < 32; j++ { res.CyclotomicSquare(&res).CyclotomicSquare(&res) - b1 := (k1[i] & mask) >> (62 - 2*j) - b2 := (k2[i] & mask) >> (62 - 2*j) + b1 := (s1[i] & mask) >> (62 - 2*j) + b2 := (s2[i] & mask) >> (62 - 2*j) if b1|b2 != 0 { s := (b2<<2 | b1) res.Mul(&res, &table[s-1]) @@ -466,8 +522,8 @@ func (p *E6) ExpGLV(a *E6, s *big.Int) *E6 { } } - p.Set(&res) - return p + z.Set(&res) + return z } // InverseUnitary inverse a unitary element From e382eb6c95f156abd0a5ecfca2856585e99d0c2b Mon Sep 17 00:00:00 2001 
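The lint-fix patch below also reworks every Exp variant to take the base by value and the exponent by pointer (func (z *E12) Exp(x E12, k *big.Int)), so the negative-exponent branch mutates a local copy of x instead of the caller's value; in the cyclotomic methods that inversion further becomes a cheap Conjugate. The other recurring primitive is the 2-NAF loop, which pays off precisely because the inverse is nearly free. Below is a minimal sketch of that idea, assuming e > 0 and x invertible mod an odd prime m; nafDecompose and cycloExpNAF are illustrative names, not gnark-crypto's ecc.NafDecomposition, and plain modular arithmetic again stands in for the cyclotomic subgroup.

package main

import (
	"fmt"
	"math/big"
)

// nafDecompose returns the signed digits of e, least significant first,
// each in {-1, 0, +1}, with no two adjacent non-zero digits.
func nafDecompose(e *big.Int) []int8 {
	k := new(big.Int).Set(e)
	one := big.NewInt(1)
	four := big.NewInt(4)
	tmp := new(big.Int)
	var naf []int8
	for k.Sign() > 0 {
		if k.Bit(0) == 1 {
			// d = 2 - (k mod 4) is +1 or -1; subtracting d leaves k
			// divisible by 4, forcing the next digit to be zero
			tmp.Mod(k, four)
			d := int8(2 - tmp.Int64())
			naf = append(naf, d)
			if d == 1 {
				k.Sub(k, one)
			} else {
				k.Add(k, one)
			}
		} else {
			naf = append(naf, 0)
		}
		k.Rsh(k, 1)
	}
	return naf
}

// cycloExpNAF computes x^e mod m by square-and-multiply over the NAF digits,
// multiplying by a precomputed inverse on -1 digits — the role InverseUnitary
// (a conjugation) plays for elements of the cyclotomic subgroup.
func cycloExpNAF(x, e, m *big.Int) *big.Int {
	xInv := new(big.Int).ModInverse(x, m)
	naf := nafDecompose(e)
	res := big.NewInt(1)
	for i := len(naf) - 1; i >= 0; i-- {
		res.Mul(res, res).Mod(res, m) // CyclotomicSquare in the patch
		switch naf[i] {
		case 1:
			res.Mul(res, x).Mod(res, m)
		case -1:
			res.Mul(res, xInv).Mod(res, m)
		}
	}
	return res
}

func main() {
	m := big.NewInt(1000003)
	x := big.NewInt(7)
	e := big.NewInt(12345)
	fmt.Println(cycloExpNAF(x, e, m).Cmp(new(big.Int).Exp(x, e, m)) == 0) // true
}

The NAF form has roughly one non-zero digit in three, versus one in two for plain binary, so when multiplying by x and by x⁻¹ cost the same — as they do once inversion is a conjugation — the loop cuts the expected multiplications from about one per two bits to about one per three.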
From: Youssef El Housni Date: Mon, 20 Jun 2022 11:02:19 +0100 Subject: [PATCH 12/16] fix: golangci-lint --- ecc/bls12-377/internal/fptower/e12.go | 54 +++---- ecc/bls12-377/internal/fptower/e12_test.go | 8 +- ecc/bls12-377/internal/fptower/e2.go | 16 +- ecc/bls12-377/pairing_test.go | 20 +-- ecc/bls12-378/internal/fptower/e12.go | 54 +++---- ecc/bls12-378/internal/fptower/e12_test.go | 8 +- ecc/bls12-378/internal/fptower/e2.go | 16 +- ecc/bls12-378/pairing_test.go | 20 +-- ecc/bls12-381/internal/fptower/e12.go | 54 +++---- ecc/bls12-381/internal/fptower/e12_test.go | 8 +- ecc/bls12-381/internal/fptower/e2.go | 16 +- ecc/bls12-381/pairing_test.go | 20 +-- ecc/bls24-315/internal/fptower/e12.go | 42 ++++- ecc/bls24-315/internal/fptower/e12_test.go | 2 +- ecc/bls24-315/internal/fptower/e2.go | 16 +- ecc/bls24-315/internal/fptower/e24.go | 126 +++++++-------- ecc/bls24-315/internal/fptower/e24_test.go | 10 +- ecc/bls24-315/internal/fptower/e4.go | 39 +++-- ecc/bls24-315/internal/fptower/e4_test.go | 2 +- ecc/bls24-315/pairing_test.go | 20 +-- ecc/bls24-317/internal/fptower/e12.go | 42 ++++- ecc/bls24-317/internal/fptower/e12_test.go | 2 +- ecc/bls24-317/internal/fptower/e2.go | 16 +- ecc/bls24-317/internal/fptower/e24.go | 126 +++++++-------- ecc/bls24-317/internal/fptower/e24_test.go | 10 +- ecc/bls24-317/internal/fptower/e4.go | 41 +++-- ecc/bls24-317/internal/fptower/e4_test.go | 2 +- ecc/bls24-317/pairing_test.go | 20 +-- ecc/bn254/internal/fptower/e12.go | 54 +++---- ecc/bn254/internal/fptower/e12_pairing.go | 2 +- ecc/bn254/internal/fptower/e12_test.go | 8 +- ecc/bn254/internal/fptower/e2.go | 16 +- ecc/bn254/pairing_test.go | 20 +-- ecc/bw6-633/internal/fptower/e6.go | 152 +++++++++--------- ecc/bw6-633/internal/fptower/e6_test.go | 6 +- ecc/bw6-633/pairing_test.go | 20 +-- ecc/bw6-756/internal/fptower/e6.go | 120 +++++++------- ecc/bw6-756/internal/fptower/e6_test.go | 2 +- ecc/bw6-756/pairing_test.go | 20 +-- ecc/bw6-761/internal/fptower/e6.go | 138 ++++++++-------- ecc/bw6-761/internal/fptower/e6_test.go | 6 +- ecc/bw6-761/pairing_test.go | 20 +-- .../pairing/template/tests/pairing.go.tmpl | 20 +-- .../template/fq12over6over2/fq12.go.tmpl | 56 +++---- .../tower/template/fq12over6over2/fq2.go.tmpl | 22 +-- .../fq12over6over2/tests/fq12.go.tmpl | 8 +- 46 files changed, 773 insertions(+), 727 deletions(-) diff --git a/ecc/bls12-377/internal/fptower/e12.go b/ecc/bls12-377/internal/fptower/e12.go index 2769e65952..723902e2bf 100644 --- a/ecc/bls12-377/internal/fptower/e12.go +++ b/ecc/bls12-377/internal/fptower/e12.go @@ -415,9 +415,9 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q¹²) and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, k big.Int) *E12 { +func (z *E12) Exp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -426,20 +426,20 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E12 var ops [3]E12 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -461,37 +461,37 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets 
z=xᵏ (mod q¹²) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { +func (z *E12) CyclotomicExp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E12 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -500,27 +500,27 @@ func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q¹²) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { +func (z *E12) ExpGLV(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E12 @@ -530,11 +530,11 @@ func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) diff --git a/ecc/bls12-377/internal/fptower/e12_test.go b/ecc/bls12-377/internal/fptower/e12_test.go index 3a7306a034..6fe75b79f2 100644 --- a/ecc/bls12-377/internal/fptower/e12_test.go +++ b/ecc/bls12-377/internal/fptower/e12_test.go @@ -391,8 +391,8 @@ func TestE12Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -405,7 +405,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, @@ -416,7 +416,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls12-377/internal/fptower/e2.go b/ecc/bls12-377/internal/fptower/e2.go index 32e5ac4bf9..fc300952ca 100644 --- a/ecc/bls12-377/internal/fptower/e2.go +++ b/ecc/bls12-377/internal/fptower/e2.go @@ -170,23 +170,23 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, e *big.Int) *E2 { - if 
e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } z.SetOne() diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go index e678c58613..cfa761546c 100644 --- a/ecc/bls12-377/pairing_test.go +++ b/ecc/bls12-377/pairing_test.go @@ -76,9 +76,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -97,7 +97,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -124,9 +124,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -393,21 +393,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bls12-378/internal/fptower/e12.go b/ecc/bls12-378/internal/fptower/e12.go index 27f6285fdc..0169ee5054 100644 --- a/ecc/bls12-378/internal/fptower/e12.go +++ b/ecc/bls12-378/internal/fptower/e12.go @@ -415,9 +415,9 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q¹²) and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, k big.Int) *E12 { +func (z *E12) Exp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -426,20 +426,20 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E12 var ops [3]E12 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -461,37 +461,37 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q¹²) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { +func (z *E12) CyclotomicExp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if 
k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E12 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -500,27 +500,27 @@ func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q¹²) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { +func (z *E12) ExpGLV(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E12 @@ -530,11 +530,11 @@ func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) diff --git a/ecc/bls12-378/internal/fptower/e12_test.go b/ecc/bls12-378/internal/fptower/e12_test.go index 66eef6d80b..2ce5f01057 100644 --- a/ecc/bls12-378/internal/fptower/e12_test.go +++ b/ecc/bls12-378/internal/fptower/e12_test.go @@ -391,8 +391,8 @@ func TestE12Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -405,7 +405,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, @@ -416,7 +416,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls12-378/internal/fptower/e2.go b/ecc/bls12-378/internal/fptower/e2.go index 0223d0aac8..55fd82e0b5 100644 --- a/ecc/bls12-378/internal/fptower/e2.go +++ b/ecc/bls12-378/internal/fptower/e2.go @@ -170,23 +170,23 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, e *big.Int) *E2 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == 
(x⁻¹)ᵏ (mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } z.SetOne() diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go index c0a4e66eae..020ace09a2 100644 --- a/ecc/bls12-378/pairing_test.go +++ b/ecc/bls12-378/pairing_test.go @@ -76,9 +76,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -97,7 +97,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -124,9 +124,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -393,21 +393,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bls12-381/internal/fptower/e12.go b/ecc/bls12-381/internal/fptower/e12.go index 7ba4aa36ec..0eaa9f3df4 100644 --- a/ecc/bls12-381/internal/fptower/e12.go +++ b/ecc/bls12-381/internal/fptower/e12.go @@ -415,9 +415,9 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q¹²) and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, k big.Int) *E12 { +func (z *E12) Exp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -426,20 +426,20 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E12 var ops [3]E12 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -461,37 +461,37 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q¹²) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { +func (z *E12) CyclotomicExp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = 
*bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E12 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -500,27 +500,27 @@ func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q¹²) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { +func (z *E12) ExpGLV(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E12 @@ -530,11 +530,11 @@ func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) diff --git a/ecc/bls12-381/internal/fptower/e12_test.go b/ecc/bls12-381/internal/fptower/e12_test.go index 46498b1ea5..0d5f9cd4ae 100644 --- a/ecc/bls12-381/internal/fptower/e12_test.go +++ b/ecc/bls12-381/internal/fptower/e12_test.go @@ -391,8 +391,8 @@ func TestE12Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -405,7 +405,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, @@ -416,7 +416,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls12-381/internal/fptower/e2.go b/ecc/bls12-381/internal/fptower/e2.go index a553e7fba5..6dcb1aca0e 100644 --- a/ecc/bls12-381/internal/fptower/e2.go +++ b/ecc/bls12-381/internal/fptower/e2.go @@ -170,23 +170,23 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, e *big.Int) *E2 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } z.SetOne() diff --git 
a/ecc/bls12-381/pairing_test.go b/ecc/bls12-381/pairing_test.go index 4e7d4e4d7c..5c45046b1c 100644 --- a/ecc/bls12-381/pairing_test.go +++ b/ecc/bls12-381/pairing_test.go @@ -76,9 +76,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -97,7 +97,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -124,9 +124,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -393,21 +393,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bls24-315/internal/fptower/e12.go b/ecc/bls24-315/internal/fptower/e12.go index 535122131d..faa9d387c8 100644 --- a/ecc/bls24-315/internal/fptower/e12.go +++ b/ecc/bls24-315/internal/fptower/e12.go @@ -240,23 +240,49 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it -func (z *E12) Exp(x *E12, e big.Int) *E12 { +// Exp sets z=xᵏ (mod q¹²) and returns it +// uses 2-bits windowed method +func (z *E12) Exp(x E12, k *big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) + } + var res E12 + var ops [3]E12 + res.SetOne() + ops[0].Set(&x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/ecc/bls24-315/internal/fptower/e12_test.go b/ecc/bls24-315/internal/fptower/e12_test.go index ee431c5bc0..c4cec4957f 100644 --- a/ecc/bls24-315/internal/fptower/e12_test.go +++ b/ecc/bls24-315/internal/fptower/e12_test.go @@ -249,6 +249,6 @@ func BenchmarkE12ExpBySeed(b *testing.B) { _, _ = a.SetRandom() b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, seed).Conjugate(&a) + a.Exp(a, &seed).Conjugate(&a) } } diff --git a/ecc/bls24-315/internal/fptower/e2.go b/ecc/bls24-315/internal/fptower/e2.go index 774c4c1d48..de62535878 100644 --- a/ecc/bls24-315/internal/fptower/e2.go +++ b/ecc/bls24-315/internal/fptower/e2.go @@ -163,23 +163,23 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) 
Exp(x E2, e *big.Int) *E2 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } z.SetOne() diff --git a/ecc/bls24-315/internal/fptower/e24.go b/ecc/bls24-315/internal/fptower/e24.go index a935af6e11..043b85e0da 100644 --- a/ecc/bls24-315/internal/fptower/e24.go +++ b/ecc/bls24-315/internal/fptower/e24.go @@ -153,25 +153,25 @@ func (z *E24) CyclotomicSquareCompressed(x *E24) *E24 { var t [7]E4 - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.D0.C1) - // t1 = g5^2 + // t1 = g5² t[1].Square(&x.D1.C2) // t5 = g1 + g5 t[5].Add(&x.D0.C1, &x.D1.C2) - // t2 = (g1 + g5)^2 + // t2 = (g1 + g5)² t[2].Square(&t[5]) - // t3 = g1^2 + g5^2 + // t3 = g1² + g5² t[3].Add(&t[0], &t[1]) // t5 = 2 * g1 * g5 t[5].Sub(&t[2], &t[3]) // t6 = g3 + g2 t[6].Add(&x.D1.C0, &x.D0.C2) - // t3 = (g3 + g2)^2 + // t3 = (g3 + g2)² t[3].Square(&t[6]) - // t2 = g3^2 + // t2 = g3² t[2].Square(&x.D1.C0) // t6 = 2 * nr * g1 * g5 @@ -182,33 +182,33 @@ func (z *E24) CyclotomicSquareCompressed(x *E24) *E24 { // z3 = 6 * nr * g1 * g5 + 2 * g3 z.D1.C0.Add(&t[5], &t[6]) - // t4 = nr * g5^2 + // t4 = nr * g5² t[4].MulByNonResidue(&t[1]) - // t5 = nr * g5^2 + g1^2 + // t5 = nr * g5² + g1² t[5].Add(&t[0], &t[4]) - // t6 = nr * g5^2 + g1^2 - g2 + // t6 = nr * g5² + g1² - g2 t[6].Sub(&t[5], &x.D0.C2) - // t1 = g2^2 + // t1 = g2² t[1].Square(&x.D0.C2) - // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + // t6 = 2 * nr * g5² + 2 * g1² - 2*g2 t[6].Double(&t[6]) - // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + // z2 = 3 * nr * g5² + 3 * g1² - 2*g2 z.D0.C2.Add(&t[6], &t[5]) - // t4 = nr * g2^2 + // t4 = nr * g2² t[4].MulByNonResidue(&t[1]) - // t5 = g3^2 + nr * g2^2 + // t5 = g3² + nr * g2² t[5].Add(&t[2], &t[4]) - // t6 = g3^2 + nr * g2^2 - g1 + // t6 = g3² + nr * g2² - g1 t[6].Sub(&t[5], &x.D0.C1) - // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + // t6 = 2 * g3² + 2 * nr * g2² - 2 * g1 t[6].Double(&t[6]) - // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + // z1 = 3 * g3² + 3 * nr * g2² - 2 * g1 z.D0.C1.Add(&t[6], &t[5]) - // t0 = g2^2 + g3^2 + // t0 = g2² + g3² t[0].Add(&t[2], &t[1]) // t5 = 2 * g3 * g2 t[5].Sub(&t[3], &t[0]) @@ -229,13 +229,13 @@ func (z *E24) DecompressKarabina(x *E24) *E24 { var one E4 one.SetOne() - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.D0.C1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t[1].Sub(&t[0], &x.D0.C2). Double(&t[1]). Add(&t[1], &t[0]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t[2].Square(&x.D1.C2) t[0].MulByNonResidue(&t[2]). Add(&t[0], &t[1]) @@ -248,14 +248,14 @@ func (z *E24) DecompressKarabina(x *E24) *E24 { // t1 = g2 * g1 t[1].Mul(&x.D0.C2, &x.D0.C1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t[2].Square(&x.D1.C1). Sub(&t[2], &t[1]). Double(&t[2]). Sub(&t[2], &t[1]) // t1 = g3 * g5 t[1].Mul(&x.D1.C0, &x.D1.C2) - // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // c₀ = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t[2].Add(&t[2], &t[1]) z.D0.C0.MulByNonResidue(&t[2]). 
Add(&z.D0.C0, &one) @@ -284,13 +284,13 @@ func BatchDecompressKarabina(x []E24) []E24 { one.SetOne() for i := 0; i < n; i++ { - // t0 = g1^2 + // t0 = g1² t0[i].Square(&x[i].D0.C1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t1[i].Sub(&t0[i], &x[i].D0.C2). Double(&t1[i]). Add(&t1[i], &t0[i]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t2[i].Square(&x[i].D1.C2) t0[i].MulByNonResidue(&t2[i]). Add(&t0[i], &t1[i]) @@ -307,7 +307,7 @@ func BatchDecompressKarabina(x []E24) []E24 { // t1 = g2 * g1 t1[i].Mul(&x[i].D0.C2, &x[i].D0.C1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t2[i].Square(&x[i].D1.C1) t2[i].Sub(&t2[i], &t1[i]) t2[i].Double(&t2[i]) @@ -315,7 +315,7 @@ func BatchDecompressKarabina(x []E24) []E24 { // t1 = g3 * g5 t1[i].Mul(&x[i].D1.C0, &x[i].D1.C2) - // z0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // z0 = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t2[i].Add(&t2[i], &t1[i]) x[i].D0.C0.MulByNonResidue(&t2[i]). Add(&x[i].D0.C0, &one) @@ -328,10 +328,10 @@ func BatchDecompressKarabina(x []E24) []E24 { // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E24) CyclotomicSquare(x *E24) *E24 { - // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E4^6 - // cyclosquare(x)=(3*x4^2*u + 3*x0^2 - 2*x0, - // 3*x2^2*u + 3*x3^2 - 2*x1, - // 3*x5^2*u + 3*x1^2 - 2*x2, + // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E4⁶ + // cyclosquare(x)=(3*x4²*u + 3*x0² - 2*x0, + // 3*x2²*u + 3*x3² - 2*x1, + // 3*x5²*u + 3*x1² - 2*x2, // 6*x1*x5*u + 2*x3, // 6*x0*x4 + 2*x4, // 6*x2*x3 + 2*x5) @@ -348,9 +348,9 @@ func (z *E24) CyclotomicSquare(x *E24) *E24 { t[5].Square(&x.D0.C1) t[8].Add(&x.D1.C2, &x.D0.C1).Square(&t[8]).Sub(&t[8], &t[4]).Sub(&t[8], &t[5]).MulByNonResidue(&t[8]) // 2*x5*x1*u - t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4^2*u + x0^2 - t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2^2*u + x3^2 - t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5^2*u + x1^2 + t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4²*u + x0² + t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2²*u + x3² + t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5²*u + x1² z.D0.C0.Sub(&t[0], &x.D0.C0).Double(&z.D0.C0).Add(&z.D0.C0, &t[0]) z.D0.C1.Sub(&t[2], &x.D0.C1).Double(&z.D0.C1).Add(&z.D0.C1, &t[2]) @@ -413,9 +413,9 @@ func BatchInvertE24(a []E24) []E24 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q²⁴) and returns it // uses 2-bits windowed method -func (z *E24) Exp(x *E24, k big.Int) *E24 { +func (z *E24) Exp(x E24, k *big.Int) *E24 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -424,20 +424,20 @@ func (z *E24) Exp(x *E24, k big.Int) *E24 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E24 var ops [3]E24 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -459,37 +459,37 @@ func (z *E24) Exp(x *E24, k big.Int) *E24 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q²⁴) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E24) CyclotomicExp(x *E24, k big.Int) *E24 { +func (z *E24) CyclotomicExp(x E24, k *big.Int) *E24 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, 
we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q²⁴) == (x⁻¹)ᵏ (mod q²⁴) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E24 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -498,12 +498,12 @@ func (z *E24) CyclotomicExp(x *E24, k big.Int) *E24 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q²⁴) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { +func (z *E24) ExpGLV(x E24, k *big.Int) *E24 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -511,14 +511,14 @@ func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q²⁴) == (x⁻¹)ᵏ (mod q²⁴) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E24 @@ -528,11 +528,11 @@ func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) @@ -793,10 +793,10 @@ func (z *E24) IsInSubGroup() bool { // CompressTorus GT/E24 element to half its size // z must be in the cyclotomic subgroup -// i.e. z^(p^4-p^2+1)=1 +// i.e. z^(p⁴-p²+1)=1 // e.g. GT // "COMPRESSION IN FINITE FIELDS AND TORUS-BASED CRYPTOGRAPHY", K. RUBIN AND A. 
SILVERBERG -// z.C1 == 0 only when z \in {-1,1} +// z.C1 == 0 only when z ∈ {-1,1} func (z *E24) CompressTorus() (E12, error) { if z.D1.IsZero() { diff --git a/ecc/bls24-315/internal/fptower/e24_test.go b/ecc/bls24-315/internal/fptower/e24_test.go index 227b78300e..ff70344d14 100644 --- a/ecc/bls24-315/internal/fptower/e24_test.go +++ b/ecc/bls24-315/internal/fptower/e24_test.go @@ -422,8 +422,8 @@ func TestE24Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -437,7 +437,7 @@ func TestE24Ops(t *testing.T) { q := fp.Modulus() b.Frobenius(a) c.Set(a) - c.Exp(&c, *q) + c.Exp(c, q) return c.Equal(&b) }, genA, @@ -448,7 +448,7 @@ func TestE24Ops(t *testing.T) { var b, c E24 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, @@ -459,7 +459,7 @@ func TestE24Ops(t *testing.T) { var b, c E24 q := fp.Modulus() b.FrobeniusQuad(a) - c.Exp(a, *q).Exp(&c, *q).Exp(&c, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q).Exp(c, q).Exp(c, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls24-315/internal/fptower/e4.go b/ecc/bls24-315/internal/fptower/e4.go index d73f787dd1..34fe0659c4 100644 --- a/ecc/bls24-315/internal/fptower/e4.go +++ b/ecc/bls24-315/internal/fptower/e4.go @@ -215,40 +215,37 @@ func (z *E4) Inverse(x *E4) *E4 { return z } -// Exp sets z=x**e and returns it -func (z *E4) Exp(x *E4, e big.Int) *E4 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q⁴) and returns it +func (z *E4) Exp(x E4, k *big.Int) *E4 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁴) == (x⁻¹)ᵏ (mod q⁴) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = *bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(&k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } - var res E4 - res.SetOne() + z.SetOne() b := e.Bytes() - for i := range b { + for i := 0; i < len(b); i++ { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + for j := 0; j < 8; j++ { + z.Square(z) + if (w & (0b10000000 >> j)) != 0 { + z.Mul(z, &x) } - mask = mask >> 1 } } - z.Set(&res) + return z } @@ -299,13 +296,13 @@ func (z *E4) Sqrt(x *E4) *E4 { var exp, one big.Int one.SetUint64(1) exp.Mul(q, q).Sub(&exp, &one).Rsh(&exp, 1) - d.Exp(&c, exp) + d.Exp(c, &exp) e.Mul(&d, &c).Inverse(&e) f.Mul(&d, &c).Square(&f) // computation exp.Rsh(&exp, 1) - b.Exp(x, exp) + b.Exp(*x, &exp) b.norm(&_b) o.SetOne() if _b.Equal(&o) { diff --git a/ecc/bls24-315/internal/fptower/e4_test.go b/ecc/bls24-315/internal/fptower/e4_test.go index d84267b2c2..a3a52e8f34 100644 --- a/ecc/bls24-315/internal/fptower/e4_test.go +++ b/ecc/bls24-315/internal/fptower/e4_test.go @@ -259,7 +259,7 @@ func TestE4Ops(t *testing.T) { var b, c E4 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go index bc1d1383dc..e0102f8864 100644 --- a/ecc/bls24-315/pairing_test.go +++ b/ecc/bls24-315/pairing_test.go @@ -78,9 +78,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + 
d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -99,7 +99,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -126,9 +126,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -396,21 +396,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bls24-317/internal/fptower/e12.go b/ecc/bls24-317/internal/fptower/e12.go index 315432f75b..785fea776d 100644 --- a/ecc/bls24-317/internal/fptower/e12.go +++ b/ecc/bls24-317/internal/fptower/e12.go @@ -240,23 +240,49 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**e and returns it -func (z *E12) Exp(x *E12, e big.Int) *E12 { +// Exp sets z=xᵏ (mod q¹²) and returns it +// uses 2-bits windowed method +func (z *E12) Exp(x E12, k *big.Int) *E12 { + if k.IsUint64() && k.Uint64() == 0 { + return z.SetOne() + } + + e := k + if k.Sign() == -1 { + // negative k, we invert + // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) + x.Inverse(&x) + + // we negate k in a temp big.Int since + // Int.Bit(_) of k and -k is different + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) + } + var res E12 + var ops [3]E12 + res.SetOne() + ops[0].Set(&x) + ops[1].Square(&ops[0]) + ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) + b := e.Bytes() for i := range b { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + mask := byte(0xc0) + for j := 0; j < 4; j++ { + res.Square(&res).Square(&res) + c := (w & mask) >> (6 - 2*j) + if c != 0 { + res.Mul(&res, &ops[c-1]) } - mask = mask >> 1 + mask = mask >> 2 } } z.Set(&res) + return z } diff --git a/ecc/bls24-317/internal/fptower/e12_test.go b/ecc/bls24-317/internal/fptower/e12_test.go index 19adb05dd2..76d4a6c9e0 100644 --- a/ecc/bls24-317/internal/fptower/e12_test.go +++ b/ecc/bls24-317/internal/fptower/e12_test.go @@ -248,6 +248,6 @@ func BenchmarkE12ExpBySeed(b *testing.B) { _, _ = a.SetRandom() b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, seed).Conjugate(&a) + a.Exp(a, &seed).Conjugate(&a) } } diff --git a/ecc/bls24-317/internal/fptower/e2.go b/ecc/bls24-317/internal/fptower/e2.go index f5d018924f..688d71776b 100644 --- a/ecc/bls24-317/internal/fptower/e2.go +++ b/ecc/bls24-317/internal/fptower/e2.go @@ -162,23 +162,23 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, e *big.Int) *E2 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ 
(mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } z.SetOne() diff --git a/ecc/bls24-317/internal/fptower/e24.go b/ecc/bls24-317/internal/fptower/e24.go index f25d66539d..c29a23021e 100644 --- a/ecc/bls24-317/internal/fptower/e24.go +++ b/ecc/bls24-317/internal/fptower/e24.go @@ -153,25 +153,25 @@ func (z *E24) CyclotomicSquareCompressed(x *E24) *E24 { var t [7]E4 - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.D0.C1) - // t1 = g5^2 + // t1 = g5² t[1].Square(&x.D1.C2) // t5 = g1 + g5 t[5].Add(&x.D0.C1, &x.D1.C2) - // t2 = (g1 + g5)^2 + // t2 = (g1 + g5)² t[2].Square(&t[5]) - // t3 = g1^2 + g5^2 + // t3 = g1² + g5² t[3].Add(&t[0], &t[1]) // t5 = 2 * g1 * g5 t[5].Sub(&t[2], &t[3]) // t6 = g3 + g2 t[6].Add(&x.D1.C0, &x.D0.C2) - // t3 = (g3 + g2)^2 + // t3 = (g3 + g2)² t[3].Square(&t[6]) - // t2 = g3^2 + // t2 = g3² t[2].Square(&x.D1.C0) // t6 = 2 * nr * g1 * g5 @@ -182,33 +182,33 @@ func (z *E24) CyclotomicSquareCompressed(x *E24) *E24 { // z3 = 6 * nr * g1 * g5 + 2 * g3 z.D1.C0.Add(&t[5], &t[6]) - // t4 = nr * g5^2 + // t4 = nr * g5² t[4].MulByNonResidue(&t[1]) - // t5 = nr * g5^2 + g1^2 + // t5 = nr * g5² + g1² t[5].Add(&t[0], &t[4]) - // t6 = nr * g5^2 + g1^2 - g2 + // t6 = nr * g5² + g1² - g2 t[6].Sub(&t[5], &x.D0.C2) - // t1 = g2^2 + // t1 = g2² t[1].Square(&x.D0.C2) - // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + // t6 = 2 * nr * g5² + 2 * g1² - 2*g2 t[6].Double(&t[6]) - // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + // z2 = 3 * nr * g5² + 3 * g1² - 2*g2 z.D0.C2.Add(&t[6], &t[5]) - // t4 = nr * g2^2 + // t4 = nr * g2² t[4].MulByNonResidue(&t[1]) - // t5 = g3^2 + nr * g2^2 + // t5 = g3² + nr * g2² t[5].Add(&t[2], &t[4]) - // t6 = g3^2 + nr * g2^2 - g1 + // t6 = g3² + nr * g2² - g1 t[6].Sub(&t[5], &x.D0.C1) - // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + // t6 = 2 * g3² + 2 * nr * g2² - 2 * g1 t[6].Double(&t[6]) - // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + // z1 = 3 * g3² + 3 * nr * g2² - 2 * g1 z.D0.C1.Add(&t[6], &t[5]) - // t0 = g2^2 + g3^2 + // t0 = g2² + g3² t[0].Add(&t[2], &t[1]) // t5 = 2 * g3 * g2 t[5].Sub(&t[3], &t[0]) @@ -229,13 +229,13 @@ func (z *E24) DecompressKarabina(x *E24) *E24 { var one E4 one.SetOne() - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.D0.C1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t[1].Sub(&t[0], &x.D0.C2). Double(&t[1]). Add(&t[1], &t[0]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t[2].Square(&x.D1.C2) t[0].MulByNonResidue(&t[2]). Add(&t[0], &t[1]) @@ -248,14 +248,14 @@ func (z *E24) DecompressKarabina(x *E24) *E24 { // t1 = g2 * g1 t[1].Mul(&x.D0.C2, &x.D0.C1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t[2].Square(&x.D1.C1). Sub(&t[2], &t[1]). Double(&t[2]). Sub(&t[2], &t[1]) // t1 = g3 * g5 t[1].Mul(&x.D1.C0, &x.D1.C2) - // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // c₀ = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t[2].Add(&t[2], &t[1]) z.D0.C0.MulByNonResidue(&t[2]). Add(&z.D0.C0, &one) @@ -284,13 +284,13 @@ func BatchDecompressKarabina(x []E24) []E24 { one.SetOne() for i := 0; i < n; i++ { - // t0 = g1^2 + // t0 = g1² t0[i].Square(&x[i].D0.C1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t1[i].Sub(&t0[i], &x[i].D0.C2). Double(&t1[i]). Add(&t1[i], &t0[i]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t2[i].Square(&x[i].D1.C2) t0[i].MulByNonResidue(&t2[i]). 
Add(&t0[i], &t1[i]) @@ -307,7 +307,7 @@ func BatchDecompressKarabina(x []E24) []E24 { // t1 = g2 * g1 t1[i].Mul(&x[i].D0.C2, &x[i].D0.C1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t2[i].Square(&x[i].D1.C1) t2[i].Sub(&t2[i], &t1[i]) t2[i].Double(&t2[i]) @@ -315,7 +315,7 @@ func BatchDecompressKarabina(x []E24) []E24 { // t1 = g3 * g5 t1[i].Mul(&x[i].D1.C0, &x[i].D1.C2) - // z0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // z0 = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t2[i].Add(&t2[i], &t1[i]) x[i].D0.C0.MulByNonResidue(&t2[i]). Add(&x[i].D0.C0, &one) @@ -328,10 +328,10 @@ func BatchDecompressKarabina(x []E24) []E24 { // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E24) CyclotomicSquare(x *E24) *E24 { - // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E4^6 - // cyclosquare(x)=(3*x4^2*u + 3*x0^2 - 2*x0, - // 3*x2^2*u + 3*x3^2 - 2*x1, - // 3*x5^2*u + 3*x1^2 - 2*x2, + // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E4⁶ + // cyclosquare(x)=(3*x4²*u + 3*x0² - 2*x0, + // 3*x2²*u + 3*x3² - 2*x1, + // 3*x5²*u + 3*x1² - 2*x2, // 6*x1*x5*u + 2*x3, // 6*x0*x4 + 2*x4, // 6*x2*x3 + 2*x5) @@ -348,9 +348,9 @@ func (z *E24) CyclotomicSquare(x *E24) *E24 { t[5].Square(&x.D0.C1) t[8].Add(&x.D1.C2, &x.D0.C1).Square(&t[8]).Sub(&t[8], &t[4]).Sub(&t[8], &t[5]).MulByNonResidue(&t[8]) // 2*x5*x1*u - t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4^2*u + x0^2 - t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2^2*u + x3^2 - t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5^2*u + x1^2 + t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4²*u + x0² + t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2²*u + x3² + t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5²*u + x1² z.D0.C0.Sub(&t[0], &x.D0.C0).Double(&z.D0.C0).Add(&z.D0.C0, &t[0]) z.D0.C1.Sub(&t[2], &x.D0.C1).Double(&z.D0.C1).Add(&z.D0.C1, &t[2]) @@ -413,9 +413,9 @@ func BatchInvertE24(a []E24) []E24 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q²⁴) and returns it // uses 2-bits windowed method -func (z *E24) Exp(x *E24, k big.Int) *E24 { +func (z *E24) Exp(x E24, k *big.Int) *E24 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -424,20 +424,20 @@ func (z *E24) Exp(x *E24, k big.Int) *E24 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E24 var ops [3]E24 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -459,37 +459,37 @@ func (z *E24) Exp(x *E24, k big.Int) *E24 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q²⁴) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E24) CyclotomicExp(x *E24, k big.Int) *E24 { +func (z *E24) CyclotomicExp(x E24, k *big.Int) *E24 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q²⁴) == (x⁻¹)ᵏ (mod q²⁴) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E24 - 
xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -498,12 +498,12 @@ func (z *E24) CyclotomicExp(x *E24, k big.Int) *E24 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q²⁴) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { +func (z *E24) ExpGLV(x E24, k *big.Int) *E24 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -511,14 +511,14 @@ func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q²⁴) == (x⁻¹)ᵏ (mod q²⁴) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E24 @@ -528,11 +528,11 @@ func (z *E24) ExpGLV(x *E24, k big.Int) *E24 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) @@ -793,10 +793,10 @@ func (z *E24) IsInSubGroup() bool { // CompressTorus GT/E24 element to half its size // z must be in the cyclotomic subgroup -// i.e. z^(p^4-p^2+1)=1 +// i.e. z^(p⁴-p²+1)=1 // e.g. GT // "COMPRESSION IN FINITE FIELDS AND TORUS-BASED CRYPTOGRAPHY", K. RUBIN AND A. 
SILVERBERG -// z.C1 == 0 only when z \in {-1,1} +// z.C1 == 0 only when z ∈ {-1,1} func (z *E24) CompressTorus() (E12, error) { if z.D1.IsZero() { diff --git a/ecc/bls24-317/internal/fptower/e24_test.go b/ecc/bls24-317/internal/fptower/e24_test.go index b5c2dec411..6f235ca829 100644 --- a/ecc/bls24-317/internal/fptower/e24_test.go +++ b/ecc/bls24-317/internal/fptower/e24_test.go @@ -422,8 +422,8 @@ func TestE24Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -437,7 +437,7 @@ func TestE24Ops(t *testing.T) { q := fp.Modulus() b.Frobenius(a) c.Set(a) - c.Exp(&c, *q) + c.Exp(c, q) return c.Equal(&b) }, genA, @@ -448,7 +448,7 @@ func TestE24Ops(t *testing.T) { var b, c E24 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, @@ -459,7 +459,7 @@ func TestE24Ops(t *testing.T) { var b, c E24 q := fp.Modulus() b.FrobeniusQuad(a) - c.Exp(a, *q).Exp(&c, *q).Exp(&c, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q).Exp(c, q).Exp(c, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls24-317/internal/fptower/e4.go b/ecc/bls24-317/internal/fptower/e4.go index bed1311c0d..63e6b37321 100644 --- a/ecc/bls24-317/internal/fptower/e4.go +++ b/ecc/bls24-317/internal/fptower/e4.go @@ -160,7 +160,7 @@ func (z *E4) MulByNonResidue(x *E4) *E4 { return z } -// MulByNonResidueInv mul x by (0,1)^{-1} +// MulByNonResidueInv mul x by (0,1)⁻¹ func (z *E4) MulByNonResidueInv(x *E4) *E4 { a := x.B1 var uInv E2 @@ -216,40 +216,37 @@ func (z *E4) Inverse(x *E4) *E4 { return z } -// Exp sets z=x**e and returns it -func (z *E4) Exp(x *E4, e big.Int) *E4 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q⁴) and returns it +func (z *E4) Exp(x E4, k *big.Int) *E4 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁴) == (x⁻¹)ᵏ (mod q⁴) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = *bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(&k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } - var res E4 - res.SetOne() + z.SetOne() b := e.Bytes() - for i := range b { + for i := 0; i < len(b); i++ { w := b[i] - mask := byte(0x80) - for j := 7; j >= 0; j-- { - res.Square(&res) - if (w&mask)>>j != 0 { - res.Mul(&res, x) + for j := 0; j < 8; j++ { + z.Square(z) + if (w & (0b10000000 >> j)) != 0 { + z.Mul(z, &x) } - mask = mask >> 1 } } - z.Set(&res) + return z } @@ -300,13 +297,13 @@ func (z *E4) Sqrt(x *E4) *E4 { var exp, one big.Int one.SetUint64(1) exp.Mul(q, q).Sub(&exp, &one).Rsh(&exp, 1) - d.Exp(&c, exp) + d.Exp(c, &exp) e.Mul(&d, &c).Inverse(&e) f.Mul(&d, &c).Square(&f) // computation exp.Rsh(&exp, 1) - b.Exp(x, exp) + b.Exp(*x, &exp) b.norm(&_b) o.SetOne() if _b.Equal(&o) { diff --git a/ecc/bls24-317/internal/fptower/e4_test.go b/ecc/bls24-317/internal/fptower/e4_test.go index 0afe602673..f0f9932b52 100644 --- a/ecc/bls24-317/internal/fptower/e4_test.go +++ b/ecc/bls24-317/internal/fptower/e4_test.go @@ -257,7 +257,7 @@ func TestE4Ops(t *testing.T) { var b, c E4 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, diff --git a/ecc/bls24-317/pairing_test.go b/ecc/bls24-317/pairing_test.go index b4a4595f81..65bd739a75 100644 --- a/ecc/bls24-317/pairing_test.go +++ 
b/ecc/bls24-317/pairing_test.go @@ -77,9 +77,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -98,7 +98,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -125,9 +125,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -394,21 +394,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bn254/internal/fptower/e12.go b/ecc/bn254/internal/fptower/e12.go index 43f09f1f80..950d60de6c 100644 --- a/ecc/bn254/internal/fptower/e12.go +++ b/ecc/bn254/internal/fptower/e12.go @@ -415,9 +415,9 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q¹²) and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, k big.Int) *E12 { +func (z *E12) Exp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -426,20 +426,20 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E12 var ops [3]E12 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -461,37 +461,37 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q¹²) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { +func (z *E12) CyclotomicExp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E12 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { 
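[Editor's note] The CyclotomicExp loop wrapped across this line walks a 2-NAF recoding of the exponent: signed digits in {-1, 0, 1} with no two adjacent non-zeros, so on average one multiplication per three squarings, at the price of needing a cheap inverse (the conjugate, for unitary elements). A standalone sketch of the textbook recoding and the matching signed square-and-multiply; the toy prime and names are illustrative, and ecc.NafDecomposition's exact output layout may differ:

package main

import (
	"fmt"
	"math/big"
)

// naf returns the 2-NAF digits of k, least significant first.
func naf(k *big.Int) []int8 {
	e := new(big.Int).Set(k)
	four := big.NewInt(4)
	var digits []int8
	for e.Sign() > 0 {
		var d int8
		if e.Bit(0) == 1 {
			// pick d in {1,-1} with e ≡ d (mod 4), forcing the next digit to 0
			if new(big.Int).Mod(e, four).Int64() == 1 {
				d = 1
			} else {
				d = -1
			}
			e.Sub(e, big.NewInt(int64(d)))
		}
		digits = append(digits, d)
		e.Rsh(e, 1)
	}
	return digits
}

func main() {
	p := big.NewInt(1000003)
	x := big.NewInt(3)
	xInv := new(big.Int).ModInverse(x, p) // stands in for InverseUnitary

	digits := naf(big.NewInt(478)) // 478 = 2⁹ - 2⁵ - 2: three non-zero digits
	res := big.NewInt(1)
	for i := len(digits) - 1; i >= 0; i-- {
		res.Mul(res, res).Mod(res, p) // CyclotomicSquare analogue
		if digits[i] == 1 {
			res.Mul(res, x).Mod(res, p)
		} else if digits[i] == -1 {
			res.Mul(res, xInv).Mod(res, p)
		}
	}
	fmt.Println(res.Cmp(new(big.Int).Exp(x, big.NewInt(478), p)) == 0) // true
}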
res.Mul(&res, &xInv) } @@ -500,27 +500,27 @@ func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q¹²) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { +func (z *E12) ExpGLV(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E12 @@ -530,11 +530,11 @@ func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) diff --git a/ecc/bn254/internal/fptower/e12_pairing.go b/ecc/bn254/internal/fptower/e12_pairing.go index 9b36b67816..a4abaf510d 100644 --- a/ecc/bn254/internal/fptower/e12_pairing.go +++ b/ecc/bn254/internal/fptower/e12_pairing.go @@ -12,7 +12,7 @@ func (z *E12) nSquareCompressed(n int) { } } -// Expt set z to xᵗ in E12 and return z (t is the generator of the curve) +// Expt set z to xᵗ (mod q¹²) and return z (t is the generator of the curve) func (z *E12) Expt(x *E12) *E12 { // Expt computation is derived from the addition chain: // diff --git a/ecc/bn254/internal/fptower/e12_test.go b/ecc/bn254/internal/fptower/e12_test.go index ad38127038..a503e238cd 100644 --- a/ecc/bn254/internal/fptower/e12_test.go +++ b/ecc/bn254/internal/fptower/e12_test.go @@ -391,8 +391,8 @@ func TestE12Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -405,7 +405,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, @@ -416,7 +416,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, diff --git a/ecc/bn254/internal/fptower/e2.go b/ecc/bn254/internal/fptower/e2.go index f6bec9b381..3d12b8b7e8 100644 --- a/ecc/bn254/internal/fptower/e2.go +++ b/ecc/bn254/internal/fptower/e2.go @@ -170,23 +170,23 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, e *big.Int) *E2 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer 
bigIntPool.Put(e) + e.Neg(k) } z.SetOne() diff --git a/ecc/bn254/pairing_test.go b/ecc/bn254/pairing_test.go index e342770160..e29ae8bb76 100644 --- a/ecc/bn254/pairing_test.go +++ b/ecc/bn254/pairing_test.go @@ -76,9 +76,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -97,7 +97,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -124,9 +124,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -393,21 +393,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bw6-633/internal/fptower/e6.go b/ecc/bw6-633/internal/fptower/e6.go index fdc1afa39c..92f54a1dd3 100644 --- a/ecc/bw6-633/internal/fptower/e6.go +++ b/ecc/bw6-633/internal/fptower/e6.go @@ -155,25 +155,25 @@ func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { var t [7]fp.Element - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.B0.A1) - // t1 = g5^2 + // t1 = g5² t[1].Square(&x.B1.A2) // t5 = g1 + g5 t[5].Add(&x.B0.A1, &x.B1.A2) - // t2 = (g1 + g5)^2 + // t2 = (g1 + g5)² t[2].Square(&t[5]) - // t3 = g1^2 + g5^2 + // t3 = g1² + g5² t[3].Add(&t[0], &t[1]) // t5 = 2 * g1 * g5 t[5].Sub(&t[2], &t[3]) // t6 = g3 + g2 t[6].Add(&x.B1.A0, &x.B0.A2) - // t3 = (g3 + g2)^2 + // t3 = (g3 + g2)² t[3].Square(&t[6]) - // t2 = g3^2 + // t2 = g3² t[2].Square(&x.B1.A0) // t6 = 2 * nr * g1 * g5 @@ -184,33 +184,33 @@ func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { // z3 = 6 * nr * g1 * g5 + 2 * g3 z.B1.A0.Add(&t[5], &t[6]) - // t4 = nr * g5^2 + // t4 = nr * g5² t[4].MulByNonResidue(&t[1]) - // t5 = nr * g5^2 + g1^2 + // t5 = nr * g5² + g1² t[5].Add(&t[0], &t[4]) - // t6 = nr * g5^2 + g1^2 - g2 + // t6 = nr * g5² + g1² - g2 t[6].Sub(&t[5], &x.B0.A2) - // t1 = g2^2 + // t1 = g2² t[1].Square(&x.B0.A2) - // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + // t6 = 2 * nr * g5² + 2 * g1² - 2*g2 t[6].Double(&t[6]) - // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + // z2 = 3 * nr * g5² + 3 * g1² - 2*g2 z.B0.A2.Add(&t[6], &t[5]) - // t4 = nr * g2^2 + // t4 = nr * g2² t[4].MulByNonResidue(&t[1]) - // t5 = g3^2 + nr * g2^2 + // t5 = g3² + nr * g2² t[5].Add(&t[2], &t[4]) - // t6 = g3^2 + nr * g2^2 - g1 + // t6 = g3² + nr * g2² - g1 t[6].Sub(&t[5], &x.B0.A1) - // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + // t6 = 2 * g3² + 2 * nr * g2² - 2 * g1 t[6].Double(&t[6]) - // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + // z1 = 3 * g3² + 3 * nr * g2² - 2 * g1 z.B0.A1.Add(&t[6], &t[5]) - // t0 = g2^2 + g3^2 + // t0 = g2² + g3² t[0].Add(&t[2], &t[1]) // t5 = 2 * g3 * g2 t[5].Sub(&t[3], &t[0]) @@ -231,13 +231,13 @@ func (z *E6) Decompress(x *E6) *E6 { var one fp.Element 
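[Editor's note] Decompress here recovers the g0 and g4 coordinates dropped by CyclotomicSquareCompressed, so a run of Karabina compressed squarings pays only one decompression at the end. A hedged usage sketch: it assumes the public bw6-633 package re-exports these E6 methods through GT and that Generators and Pair keep their usual gnark-crypto signatures:

package main

import (
	"fmt"

	bw6633 "github.com/consensys/gnark-crypto/ecc/bw6-633"
)

func main() {
	// obtain an element of the cyclotomic subgroup via a pairing
	_, _, g1, g2 := bw6633.Generators()
	x, err := bw6633.Pair([]bw6633.G1Affine{g1}, []bw6633.G2Affine{g2})
	if err != nil {
		panic(err)
	}

	// chain compressed squarings, decompress once at the end
	var a, b bw6633.GT
	a.Set(&x)
	for i := 0; i < 8; i++ { // 8 is an arbitrary count
		a.CyclotomicSquareCompressed(&a)
	}
	a.Decompress(&a)

	// reference: the same run of plain cyclotomic squarings
	b.Set(&x)
	for i := 0; i < 8; i++ {
		b.CyclotomicSquare(&b)
	}
	fmt.Println(a.Equal(&b)) // true
}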
one.SetOne() - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.B0.A1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t[1].Sub(&t[0], &x.B0.A2). Double(&t[1]). Add(&t[1], &t[0]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t[2].Square(&x.B1.A2) t[0].MulByNonResidue(&t[2]). Add(&t[0], &t[1]) @@ -250,14 +250,14 @@ func (z *E6) Decompress(x *E6) *E6 { // t1 = g2 * g1 t[1].Mul(&x.B0.A2, &x.B0.A1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t[2].Square(&x.B1.A1). Sub(&t[2], &t[1]). Double(&t[2]). Sub(&t[2], &t[1]) // t1 = g3 * g5 t[1].Mul(&x.B1.A0, &x.B1.A2) - // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // c₀ = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t[2].Add(&t[2], &t[1]) z.B0.A0.MulByNonResidue(&t[2]). Add(&z.B0.A0, &one) @@ -273,10 +273,10 @@ func (z *E6) Decompress(x *E6) *E6 { // Granger-Scott's cyclotomic square // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E6) CyclotomicSquare(x *E6) *E6 { - // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3^6 - // cyclosquare(x)=(3*x4^2*u + 3*x0^2 - 2*x0, - // 3*x2^2*u + 3*x3^2 - 2*x1, - // 3*x5^2*u + 3*x1^2 - 2*x2, + // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3⁶ + // cyclosquare(x)=(3*x4²*u + 3*x0² - 2*x0, + // 3*x2²*u + 3*x3² - 2*x1, + // 3*x5²*u + 3*x1² - 2*x2, // 6*x1*x5*u + 2*x3, // 6*x0*x4 + 2*x4, // 6*x2*x3 + 2*x5) @@ -293,9 +293,9 @@ func (z *E6) CyclotomicSquare(x *E6) *E6 { t[5].Square(&x.B0.A1) t[8].Add(&x.B1.A2, &x.B0.A1).Square(&t[8]).Sub(&t[8], &t[4]).Sub(&t[8], &t[5]).MulByNonResidue(&t[8]) // 2*x5*x1*u - t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4^2*u + x0^2 - t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2^2*u + x3^2 - t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5^2*u + x1^2 + t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4²*u + x0² + t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2²*u + x3² + t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5²*u + x1² z.B0.A0.Sub(&t[0], &x.B0.A0).Double(&z.B0.A0).Add(&z.B0.A0, &t[0]) z.B0.A1.Sub(&t[2], &x.B0.A1).Double(&z.B0.A1).Add(&z.B0.A1, &t[2]) @@ -358,9 +358,9 @@ func BatchInvertE6(a []E6) []E6 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q⁶) and returns it // uses 2-bits windowed method -func (z *E6) Exp(x *E6, k big.Int) *E6 { +func (z *E6) Exp(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -368,21 +368,21 @@ func (z *E6) Exp(x *E6, k big.Int) *E6 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E6 var ops [3]E6 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -404,37 +404,37 @@ func (z *E6) Exp(x *E6, k big.Int) *E6 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q⁶) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { +func (z *E6) CyclotomicExp(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Conjugate(&x) // 
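[Editor's note] The conjugate-for-inverse shortcut that CyclotomicExp uses on negative exponents (and that InverseUnitary implements) holds because cyclotomic-subgroup elements are unitary: their norm is 1, so conjugation inverts them. The complex unit circle is the familiar special case:

package main

import (
	"fmt"
	"math/cmplx"
)

func main() {
	z := cmplx.Exp(1.3i)            // |z| = 1: a "unitary" element
	fmt.Println(cmplx.Conj(z), 1/z) // conjugate and inverse coincide
}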
we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E6 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -443,12 +443,12 @@ func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q⁶) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { +func (z *E6) ExpGLV(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -456,14 +456,14 @@ func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E6 @@ -473,11 +473,11 @@ func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) @@ -618,9 +618,9 @@ func (z *E6) IsInSubGroup() bool { _a.Frobenius(z) a.CyclotomicSquare(&_a).Mul(&a, &_a) // z^(3p) - // t(x)-1 = (-10-4x-13x^2+6x^3+7x^4-23x^5+19x^6-12x^7+2x^8+11x^9-7x^10)/3 - t[0].CyclotomicSquare(z) // z^2 - t[1].CyclotomicSquare(&t[0]) // z^4 + // t(x)-1 = (-10-4x-13x²+6x³+7x⁴-23x⁵+19x⁶-12x⁷+2x⁸+11x⁹-7x¹⁰)/3 + t[0].CyclotomicSquare(z) // z² + t[1].CyclotomicSquare(&t[0]) // z⁴ t[2].CyclotomicSquare(&t[1]). Mul(&t[2], &t[0]). Conjugate(&t[2]) // *z^(-10) @@ -630,52 +630,52 @@ func (z *E6) IsInSubGroup() bool { Mul(&t[4], &t[2]). Mul(&t[4], z). Expt(&t[4]). - Expt(&t[4]) // *z^(-13u^2) + Expt(&t[4]) // *z^(-13u²) t[5].Mul(&t[0], &t[1]). Expt(&t[5]). Expt(&t[5]). - Expt(&t[5]) // *z^(6u^3) + Expt(&t[5]) // *z^(6u³) tmp.Expt(z). Expt(&tmp). - Expt(&tmp) // z^(u^3) + Expt(&tmp) // z^(u³) t[6].Mul(&tmp, &t[5]). - Expt(&t[6]) // *z^(7u^4) + Expt(&t[6]) // *z^(7u⁴) t[7].CyclotomicSquare(&t[5]). - CyclotomicSquare(&t[7]) // z^(24u^3) - tmp.Conjugate(&tmp) // z^(-u^3) + CyclotomicSquare(&t[7]) // z^(24u³) + tmp.Conjugate(&tmp) // z^(-u³) t[7].Mul(&t[7], &tmp). Conjugate(&t[7]). Expt(&t[7]). - Expt(&t[7]) // *z^(-23u^5) + Expt(&t[7]) // *z^(-23u⁵) t[8].Conjugate(&t[4]). Expt(&t[8]). Mul(&t[8], &t[5]). Expt(&t[8]). Expt(&t[8]). - Expt(&t[8]) // *z^(19u^6) + Expt(&t[8]) // *z^(19u⁶) t[9].Conjugate(&t[5]). CyclotomicSquare(&t[9]). Expt(&t[9]). Expt(&t[9]). Expt(&t[9]). - Expt(&t[9]) // *z^(-12u^7) + Expt(&t[9]) // *z^(-12u⁷) tmp.Expt(&t[7]). - Expt(&tmp) // z^(-23u^7) + Expt(&tmp) // z^(-23u⁷) t[10].Conjugate(&t[9]). CyclotomicSquare(&t[10]). 
- Mul(&t[10], &tmp) // z^(u^7) + Mul(&t[10], &tmp) // z^(u⁷) t[11].Mul(&t[9], &t[10]). Conjugate(&t[11]). Expt(&t[11]). - Expt(&t[11]) // *z^(11u^9) + Expt(&t[11]) // *z^(11u⁹) t[10].Expt(&t[10]). - CyclotomicSquare(&t[10]) // *z^(2u^8) + CyclotomicSquare(&t[10]) // *z^(2u⁸) t[12].Conjugate(&t[10]). CyclotomicSquare(&t[12]). Expt(&t[12]). Mul(&t[12], &t[11]). Expt(&t[12]). - Conjugate(&t[12]) // *z^(-7u^10) + Conjugate(&t[12]) // *z^(-7u¹⁰) b.Mul(&t[2], &t[3]). Mul(&b, &t[4]). @@ -693,10 +693,10 @@ func (z *E6) IsInSubGroup() bool { // CompressTorus GT/E6 element to half its size // z must be in the cyclotomic subgroup -// i.e. z^(p^4-p^2+1)=1 +// i.e. z^(p⁴-p²+1)=1 // e.g. GT // "COMPRESSION IN FINITE FIELDS AND TORUS-BASED CRYPTOGRAPHY", K. RUBIN AND A. SILVERBERG -// z.B1 == 0 only when z \in {-1,1} +// z.B1 == 0 only when z ∈ {-1,1} func (z *E6) CompressTorus() (E3, error) { if z.B1.IsZero() { diff --git a/ecc/bw6-633/internal/fptower/e6_test.go b/ecc/bw6-633/internal/fptower/e6_test.go index dd19286d57..8bda2d0922 100644 --- a/ecc/bw6-633/internal/fptower/e6_test.go +++ b/ecc/bw6-633/internal/fptower/e6_test.go @@ -328,8 +328,8 @@ func TestE6Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -342,7 +342,7 @@ func TestE6Ops(t *testing.T) { var b, c E6 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go index a8ed3ab2dc..3a3fd30c80 100644 --- a/ecc/bw6-633/pairing_test.go +++ b/ecc/bw6-633/pairing_test.go @@ -78,9 +78,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -99,7 +99,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -126,9 +126,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -396,21 +396,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/ecc/bw6-756/internal/fptower/e6.go b/ecc/bw6-756/internal/fptower/e6.go index 16464db219..35d6b82a76 100644 --- a/ecc/bw6-756/internal/fptower/e6.go +++ b/ecc/bw6-756/internal/fptower/e6.go @@ -154,25 +154,25 @@ func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { var t [7]fp.Element - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.B0.A1) - // t1 = g5^2 + // t1 = g5² t[1].Square(&x.B1.A2) // t5 = g1 + g5 t[5].Add(&x.B0.A1, &x.B1.A2) - // t2 = (g1 + g5)^2 + // t2 = (g1 + g5)² t[2].Square(&t[5]) - // t3 = g1^2 + g5^2 + // t3 = g1² + g5² t[3].Add(&t[0], &t[1]) // t5 = 2 * g1 * g5 t[5].Sub(&t[2], &t[3]) // 
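[Editor's note] A quick cross-check of the cyclotomic-squaring code touched here: on a cyclotomic-subgroup element, CyclotomicSquare must agree with the generic Square, it just costs far fewer multiplications. Hedged sketch against the public bw6-756 package, assuming GT re-exports the E6 methods and the usual Generators/Pair signatures:

package main

import (
	"fmt"

	bw6756 "github.com/consensys/gnark-crypto/ecc/bw6-756"
)

func main() {
	_, _, g1, g2 := bw6756.Generators()
	x, err := bw6756.Pair([]bw6756.G1Affine{g1}, []bw6756.G2Affine{g2})
	if err != nil {
		panic(err)
	}
	var a, b bw6756.GT
	a.CyclotomicSquare(&x)   // valid only in the cyclotomic subgroup
	b.Square(&x)             // generic E6 squaring
	fmt.Println(a.Equal(&b)) // true
}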
t6 = g3 + g2 t[6].Add(&x.B1.A0, &x.B0.A2) - // t3 = (g3 + g2)^2 + // t3 = (g3 + g2)² t[3].Square(&t[6]) - // t2 = g3^2 + // t2 = g3² t[2].Square(&x.B1.A0) // t6 = 2 * nr * g1 * g5 @@ -183,33 +183,33 @@ func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { // z3 = 6 * nr * g1 * g5 + 2 * g3 z.B1.A0.Add(&t[5], &t[6]) - // t4 = nr * g5^2 + // t4 = nr * g5² t[4].MulByNonResidue(&t[1]) - // t5 = nr * g5^2 + g1^2 + // t5 = nr * g5² + g1² t[5].Add(&t[0], &t[4]) - // t6 = nr * g5^2 + g1^2 - g2 + // t6 = nr * g5² + g1² - g2 t[6].Sub(&t[5], &x.B0.A2) - // t1 = g2^2 + // t1 = g2² t[1].Square(&x.B0.A2) - // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + // t6 = 2 * nr * g5² + 2 * g1² - 2*g2 t[6].Double(&t[6]) - // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + // z2 = 3 * nr * g5² + 3 * g1² - 2*g2 z.B0.A2.Add(&t[6], &t[5]) - // t4 = nr * g2^2 + // t4 = nr * g2² t[4].MulByNonResidue(&t[1]) - // t5 = g3^2 + nr * g2^2 + // t5 = g3² + nr * g2² t[5].Add(&t[2], &t[4]) - // t6 = g3^2 + nr * g2^2 - g1 + // t6 = g3² + nr * g2² - g1 t[6].Sub(&t[5], &x.B0.A1) - // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + // t6 = 2 * g3² + 2 * nr * g2² - 2 * g1 t[6].Double(&t[6]) - // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + // z1 = 3 * g3² + 3 * nr * g2² - 2 * g1 z.B0.A1.Add(&t[6], &t[5]) - // t0 = g2^2 + g3^2 + // t0 = g2² + g3² t[0].Add(&t[2], &t[1]) // t5 = 2 * g3 * g2 t[5].Sub(&t[3], &t[0]) @@ -230,13 +230,13 @@ func (z *E6) Decompress(x *E6) *E6 { var one fp.Element one.SetOne() - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.B0.A1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t[1].Sub(&t[0], &x.B0.A2). Double(&t[1]). Add(&t[1], &t[0]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t[2].Square(&x.B1.A2) t[0].MulByNonResidue(&t[2]). Add(&t[0], &t[1]) @@ -249,14 +249,14 @@ func (z *E6) Decompress(x *E6) *E6 { // t1 = g2 * g1 t[1].Mul(&x.B0.A2, &x.B0.A1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t[2].Square(&x.B1.A1). Sub(&t[2], &t[1]). Double(&t[2]). Sub(&t[2], &t[1]) // t1 = g3 * g5 t[1].Mul(&x.B1.A0, &x.B1.A2) - // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // c₀ = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t[2].Add(&t[2], &t[1]) z.B0.A0.MulByNonResidue(&t[2]). 
Add(&z.B0.A0, &one) @@ -272,10 +272,10 @@ func (z *E6) Decompress(x *E6) *E6 { // Granger-Scott's cyclotomic square // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E6) CyclotomicSquare(x *E6) *E6 { - // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3^6 - // cyclosquare(x)=(3*x4^2*u + 3*x0^2 - 2*x0, - // 3*x2^2*u + 3*x3^2 - 2*x1, - // 3*x5^2*u + 3*x1^2 - 2*x2, + // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3⁶ + // cyclosquare(x)=(3*x4²*u + 3*x0² - 2*x0, + // 3*x2²*u + 3*x3² - 2*x1, + // 3*x5²*u + 3*x1² - 2*x2, // 6*x1*x5*u + 2*x3, // 6*x0*x4 + 2*x4, // 6*x2*x3 + 2*x5) @@ -292,9 +292,9 @@ func (z *E6) CyclotomicSquare(x *E6) *E6 { t[5].Square(&x.B0.A1) t[8].Add(&x.B1.A2, &x.B0.A1).Square(&t[8]).Sub(&t[8], &t[4]).Sub(&t[8], &t[5]).MulByNonResidue(&t[8]) // 2*x5*x1*u - t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4^2*u + x0^2 - t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2^2*u + x3^2 - t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5^2*u + x1^2 + t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4²*u + x0² + t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2²*u + x3² + t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5²*u + x1² z.B0.A0.Sub(&t[0], &x.B0.A0).Double(&z.B0.A0).Add(&z.B0.A0, &t[0]) z.B0.A1.Sub(&t[2], &x.B0.A1).Double(&z.B0.A1).Add(&z.B0.A1, &t[2]) @@ -357,9 +357,9 @@ func BatchInvertE6(a []E6) []E6 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q⁶) and returns it // uses 2-bits windowed method -func (z *E6) Exp(x *E6, k big.Int) *E6 { +func (z *E6) Exp(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -367,21 +367,21 @@ func (z *E6) Exp(x *E6, k big.Int) *E6 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E6 var ops [3]E6 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -403,37 +403,37 @@ func (z *E6) Exp(x *E6, k big.Int) *E6 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q⁶) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { +func (z *E6) CyclotomicExp(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E6 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -442,12 +442,12 @@ func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q⁶) and returns it // uses 2-dimensional GLV 
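[Editor's note] ExpGLV, whose doc comment is wrapped across this line, relies on the identity x^k = x^s0 · φ(x)^s1 once SplitScalar writes k ≡ s0 + s1·λ (mod r) with half-size s0 and s1, where φ is the Frobenius acting on GT as exponentiation by λ. A toy check of the identity itself over integers; every number below is illustrative, not a curve parameter:

package main

import (
	"fmt"
	"math/big"
)

func main() {
	p := big.NewInt(23)     // toy field; 4 generates a subgroup of order r = 11
	x := big.NewInt(4)
	k := big.NewInt(7)
	lambda := big.NewInt(3) // stand-in for the Frobenius eigenvalue on GT
	// one valid split: 7 ≡ 1 + 2·3 (mod 11)
	s0, s1 := big.NewInt(1), big.NewInt(2)

	lhs := new(big.Int).Exp(x, k, p)

	phiX := new(big.Int).Exp(x, lambda, p) // φ(x) = x^λ
	rhs := new(big.Int).Exp(x, s0, p)
	rhs.Mul(rhs, new(big.Int).Exp(phiX, s1, p)).Mod(rhs, p)

	fmt.Println(lhs, rhs) // 8 8
}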
with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { +func (z *E6) ExpGLV(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -455,14 +455,14 @@ func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E6 @@ -472,11 +472,11 @@ func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) @@ -604,16 +604,16 @@ func (z *E6) SetBytes(e []byte) error { func (z *E6) IsInSubGroup() bool { var one, _z E6 one.SetOne() - _z.Exp(z, *fr.Modulus()) + _z.Exp(*z, fr.Modulus()) return _z.Equal(&one) } // CompressTorus GT/E6 element to half its size // z must be in the cyclotomic subgroup -// i.e. z^(p^4-p^2+1)=1 +// i.e. z^(p⁴-p²+1)=1 // e.g. GT // "COMPRESSION IN FINITE FIELDS AND TORUS-BASED CRYPTOGRAPHY", K. RUBIN AND A. SILVERBERG -// z.B1 == 0 only when z \in {-1,1} +// z.B1 == 0 only when z ∈ {-1,1} func (z *E6) CompressTorus() (E3, error) { if z.B1.IsZero() { diff --git a/ecc/bw6-756/internal/fptower/e6_test.go b/ecc/bw6-756/internal/fptower/e6_test.go index b74c1943bc..58048b0e8b 100644 --- a/ecc/bw6-756/internal/fptower/e6_test.go +++ b/ecc/bw6-756/internal/fptower/e6_test.go @@ -317,7 +317,7 @@ func TestE6Ops(t *testing.T) { var b, c E6 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go index 466a7c797d..ef0b6f670d 100644 --- a/ecc/bw6-756/pairing_test.go +++ b/ecc/bw6-756/pairing_test.go @@ -77,9 +77,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -98,7 +98,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -125,9 +125,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -394,21 +394,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, 
&_e) } }) } diff --git a/ecc/bw6-761/internal/fptower/e6.go b/ecc/bw6-761/internal/fptower/e6.go index 0f914d18d0..f211c4a0c7 100644 --- a/ecc/bw6-761/internal/fptower/e6.go +++ b/ecc/bw6-761/internal/fptower/e6.go @@ -154,25 +154,25 @@ func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { var t [7]fp.Element - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.B0.A1) - // t1 = g5^2 + // t1 = g5² t[1].Square(&x.B1.A2) // t5 = g1 + g5 t[5].Add(&x.B0.A1, &x.B1.A2) - // t2 = (g1 + g5)^2 + // t2 = (g1 + g5)² t[2].Square(&t[5]) - // t3 = g1^2 + g5^2 + // t3 = g1² + g5² t[3].Add(&t[0], &t[1]) // t5 = 2 * g1 * g5 t[5].Sub(&t[2], &t[3]) // t6 = g3 + g2 t[6].Add(&x.B1.A0, &x.B0.A2) - // t3 = (g3 + g2)^2 + // t3 = (g3 + g2)² t[3].Square(&t[6]) - // t2 = g3^2 + // t2 = g3² t[2].Square(&x.B1.A0) // t6 = 2 * nr * g1 * g5 @@ -183,33 +183,33 @@ func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { // z3 = 6 * nr * g1 * g5 + 2 * g3 z.B1.A0.Add(&t[5], &t[6]) - // t4 = nr * g5^2 + // t4 = nr * g5² t[4].MulByNonResidue(&t[1]) - // t5 = nr * g5^2 + g1^2 + // t5 = nr * g5² + g1² t[5].Add(&t[0], &t[4]) - // t6 = nr * g5^2 + g1^2 - g2 + // t6 = nr * g5² + g1² - g2 t[6].Sub(&t[5], &x.B0.A2) - // t1 = g2^2 + // t1 = g2² t[1].Square(&x.B0.A2) - // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + // t6 = 2 * nr * g5² + 2 * g1² - 2*g2 t[6].Double(&t[6]) - // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + // z2 = 3 * nr * g5² + 3 * g1² - 2*g2 z.B0.A2.Add(&t[6], &t[5]) - // t4 = nr * g2^2 + // t4 = nr * g2² t[4].MulByNonResidue(&t[1]) - // t5 = g3^2 + nr * g2^2 + // t5 = g3² + nr * g2² t[5].Add(&t[2], &t[4]) - // t6 = g3^2 + nr * g2^2 - g1 + // t6 = g3² + nr * g2² - g1 t[6].Sub(&t[5], &x.B0.A1) - // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + // t6 = 2 * g3² + 2 * nr * g2² - 2 * g1 t[6].Double(&t[6]) - // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + // z1 = 3 * g3² + 3 * nr * g2² - 2 * g1 z.B0.A1.Add(&t[6], &t[5]) - // t0 = g2^2 + g3^2 + // t0 = g2² + g3² t[0].Add(&t[2], &t[1]) // t5 = 2 * g3 * g2 t[5].Sub(&t[3], &t[0]) @@ -230,13 +230,13 @@ func (z *E6) Decompress(x *E6) *E6 { var one fp.Element one.SetOne() - // t0 = g1^2 + // t0 = g1² t[0].Square(&x.B0.A1) - // t1 = 3 * g1^2 - 2 * g2 + // t1 = 3 * g1² - 2 * g2 t[1].Sub(&t[0], &x.B0.A2). Double(&t[1]). Add(&t[1], &t[0]) - // t0 = E * g5^2 + t1 + // t0 = E * g5² + t1 t[2].Square(&x.B1.A2) t[0].MulByNonResidue(&t[2]). Add(&t[0], &t[1]) @@ -249,14 +249,14 @@ func (z *E6) Decompress(x *E6) *E6 { // t1 = g2 * g1 t[1].Mul(&x.B0.A2, &x.B0.A1) - // t2 = 2 * g4^2 - 3 * g2 * g1 + // t2 = 2 * g4² - 3 * g2 * g1 t[2].Square(&x.B1.A1). Sub(&t[2], &t[1]). Double(&t[2]). Sub(&t[2], &t[1]) // t1 = g3 * g5 t[1].Mul(&x.B1.A0, &x.B1.A2) - // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + // c₀ = E * (2 * g4² + g3 * g5 - 3 * g2 * g1) + 1 t[2].Add(&t[2], &t[1]) z.B0.A0.MulByNonResidue(&t[2]). 
Add(&z.B0.A0, &one) @@ -272,10 +272,10 @@ func (z *E6) Decompress(x *E6) *E6 { // Granger-Scott's cyclotomic square // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E6) CyclotomicSquare(x *E6) *E6 { - // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3^6 - // cyclosquare(x)=(3*x4^2*u + 3*x0^2 - 2*x0, - // 3*x2^2*u + 3*x3^2 - 2*x1, - // 3*x5^2*u + 3*x1^2 - 2*x2, + // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3⁶ + // cyclosquare(x)=(3*x4²*u + 3*x0² - 2*x0, + // 3*x2²*u + 3*x3² - 2*x1, + // 3*x5²*u + 3*x1² - 2*x2, // 6*x1*x5*u + 2*x3, // 6*x0*x4 + 2*x4, // 6*x2*x3 + 2*x5) @@ -292,9 +292,9 @@ func (z *E6) CyclotomicSquare(x *E6) *E6 { t[5].Square(&x.B0.A1) t[8].Add(&x.B1.A2, &x.B0.A1).Square(&t[8]).Sub(&t[8], &t[4]).Sub(&t[8], &t[5]).MulByNonResidue(&t[8]) // 2*x5*x1*u - t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4^2*u + x0^2 - t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2^2*u + x3^2 - t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5^2*u + x1^2 + t[0].MulByNonResidue(&t[0]).Add(&t[0], &t[1]) // x4²*u + x0² + t[2].MulByNonResidue(&t[2]).Add(&t[2], &t[3]) // x2²*u + x3² + t[4].MulByNonResidue(&t[4]).Add(&t[4], &t[5]) // x5²*u + x1² z.B0.A0.Sub(&t[0], &x.B0.A0).Double(&z.B0.A0).Add(&z.B0.A0, &t[0]) z.B0.A1.Sub(&t[2], &x.B0.A1).Double(&z.B0.A1).Add(&z.B0.A1, &t[2]) @@ -357,9 +357,9 @@ func BatchInvertE6(a []E6) []E6 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q⁶) and returns it // uses 2-bits windowed method -func (z *E6) Exp(x *E6, k big.Int) *E6 { +func (z *E6) Exp(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -367,21 +367,21 @@ func (z *E6) Exp(x *E6, k big.Int) *E6 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E6 var ops [3]E6 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -403,37 +403,37 @@ func (z *E6) Exp(x *E6, k big.Int) *E6 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q⁶) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { +func (z *E6) CyclotomicExp(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E6 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -442,12 +442,12 @@ func (z *E6) CyclotomicExp(x *E6, k big.Int) *E6 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q⁶) and returns it // uses 2-dimensional GLV 
with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { +func (z *E6) ExpGLV(x E6, k *big.Int) *E6 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -455,14 +455,14 @@ func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // if k < 0: xᵏ (mod q⁶) == (x⁻¹)ᵏ (mod q⁶) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var table [15]E6 @@ -472,11 +472,11 @@ func (z *E6) ExpGLV(x *E6, k big.Int) *E6 { res.SetOne() // table[b3b2b1b0-1] = b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) @@ -616,13 +616,13 @@ func (z *E6) IsInSubGroup() bool { _a.Frobenius(z) a.CyclotomicSquare(&_a).Mul(&a, &_a) // z^(3p) - // t(x)-1 = (13x^6 − 23x^5 − 9x^4 + 35x^3 + 10x + 19)/3 - t[0].CyclotomicSquare(z) // z^2 + // t(x)-1 = (13x⁶ − 23x⁵ − 9x⁴ + 35x³ + 10x + 19)/3 + t[0].CyclotomicSquare(z) // z² t[1].CyclotomicSquare(&t[0]). - CyclotomicSquare(&t[1]) // z^8 + CyclotomicSquare(&t[1]) // z⁸ t[2].CyclotomicSquare(&t[1]). Mul(&t[2], &t[0]). - Mul(&t[2], z) // z^19* + Mul(&t[2], z) // z¹⁹* t[3].Mul(&t[0], &t[1]). Expt(&t[3]) // z^(10u)* t[4].CyclotomicSquare(&t[3]). @@ -632,25 +632,25 @@ func (z *E6) IsInSubGroup() bool { Expt(&t[0]) // z^(5u) t[4].Mul(&t[4], &t[0]). Expt(&t[4]). - Expt(&t[4]) // z^(35u^3)* + Expt(&t[4]) // z^(35u³)* t[1].Mul(&t[1], z). Expt(&t[1]). Expt(&t[1]). Expt(&t[1]). Expt(&t[1]). - Conjugate(&t[1]) // z^(-9u^4)* + Conjugate(&t[1]) // z^(-9u⁴)* t[0].Expt(&t[0]). Expt(&t[0]). Expt(&t[0]). - Conjugate(&t[0]) // z^(-5u^4) + Conjugate(&t[0]) // z^(-5u⁴) t[5].CyclotomicSquare(&t[1]). Mul(&t[5], &t[0]). - Expt(&t[5]) // z^(-23u^5)* + Expt(&t[5]) // z^(-23u⁵)* tmp.CyclotomicSquare(&t[1]). - Conjugate(&tmp) // z^(18u^4) + Conjugate(&tmp) // z^(18u⁴) t[0].Mul(&t[0], &tmp). Expt(&t[0]). - Expt(&t[0]) // z^(13u^6)* + Expt(&t[0]) // z^(13u⁶)* b.Mul(&t[2], &t[3]). Mul(&b, &t[4]). @@ -663,10 +663,10 @@ func (z *E6) IsInSubGroup() bool { // CompressTorus GT/E6 element to half its size // z must be in the cyclotomic subgroup -// i.e. z^(p^4-p^2+1)=1 +// i.e. z^(p⁴-p²+1)=1 // e.g. GT // "COMPRESSION IN FINITE FIELDS AND TORUS-BASED CRYPTOGRAPHY", K. RUBIN AND A. 
SILVERBERG -// z.B1 == 0 only when z \in {-1,1} +// z.B1 == 0 only when z ∈ {-1,1} func (z *E6) CompressTorus() (E3, error) { if z.B1.IsZero() { diff --git a/ecc/bw6-761/internal/fptower/e6_test.go b/ecc/bw6-761/internal/fptower/e6_test.go index 512ebc9438..4841bb4564 100644 --- a/ecc/bw6-761/internal/fptower/e6_test.go +++ b/ecc/bw6-761/internal/fptower/e6_test.go @@ -328,8 +328,8 @@ func TestE6Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -342,7 +342,7 @@ func TestE6Ops(t *testing.T) { var b, c E6 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go index 323deef40f..e92c59fa99 100644 --- a/ecc/bw6-761/pairing_test.go +++ b/ecc/bw6-761/pairing_test.go @@ -78,9 +78,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -99,7 +99,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -126,9 +126,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -396,21 +396,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/internal/generator/pairing/template/tests/pairing.go.tmpl b/internal/generator/pairing/template/tests/pairing.go.tmpl index d77d7e45c8..5ea344b836 100644 --- a/internal/generator/pairing/template/tests/pairing.go.tmpl +++ b/internal/generator/pairing/template/tests/pairing.go.tmpl @@ -69,9 +69,9 @@ func TestPairing(t *testing.T) { e.ToBigIntRegular(&_e) var b, c, d GT - b.Exp(&a, _e) - c.ExpGLV(&a, _e) - d.CyclotomicExp(&a, _e) + b.Exp(a, &_e) + c.ExpGLV(a, &_e) + d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) }, @@ -95,7 +95,7 @@ func TestPairing(t *testing.T) { Mul(&a, &b) c.Expt(&a).Expt(&c) - d.Exp(&a, xGen).Exp(&d, xGen) + d.Exp(a, &xGen).Exp(d, &xGen) return c.Equal(&d) }, genA, @@ -122,9 +122,9 @@ func TestPairing(t *testing.T) { resa, _ = Pair([]G1Affine{ag1}, []G2Affine{g2GenAff}) resb, _ = Pair([]G1Affine{g1GenAff}, []G2Affine{bg2}) - resab.Exp(&res, ab) - resa.Exp(&resa, bbigint) - resb.Exp(&resb, abigint) + resab.Exp(res, &ab) + resa.Exp(resa, &bbigint) + resb.Exp(resb, &abigint) return resab.Equal(&resa) && resab.Equal(&resb) && !res.Equal(&zero) @@ -401,21 +401,21 @@ func BenchmarkExpGT(b *testing.B) { b.Run("Naive windowed Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.Exp(&a, _e) + a.Exp(a, &_e) } }) b.Run("2-NAF cyclotomic Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - 
a.CyclotomicExp(&a, _e) + a.CyclotomicExp(a, &_e) } }) b.Run("windowed 2-dim GLV Exp", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - a.ExpGLV(&a, _e) + a.ExpGLV(a, &_e) } }) } diff --git a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl index 5975f193b3..8fdcabdd00 100644 --- a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl @@ -398,9 +398,9 @@ func BatchInvertE12(a []E12) []E12 { return res } -// Exp sets z=x**k and returns it +// Exp sets z=xᵏ (mod q¹²) and returns it // uses 2-bits windowed method -func (z *E12) Exp(x *E12, k big.Int) *E12 { +func (z *E12) Exp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } @@ -409,20 +409,20 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { if k.Sign() == -1 { // negative k, we invert // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res E12 var ops [3]E12 res.SetOne() - ops[0].Set(x) + ops[0].Set(&x) ops[1].Square(&ops[0]) ops[2].Set(&ops[0]).Mul(&ops[2], &ops[1]) @@ -444,37 +444,37 @@ func (z *E12) Exp(x *E12, k big.Int) *E12 { return z } -// CyclotomicExp sets z=x**k and returns it +// CyclotomicExp sets z=xᵏ (mod q¹²) and returns it // uses 2-NAF decomposition // x must be in the cyclotomic subgroup // TODO: use a windowed method -func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { +func (z *E12) CyclotomicExp(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } var res, xInv E12 - xInv.InverseUnitary(x) + xInv.InverseUnitary(&x) res.SetOne() eNAF := make([]int8, e.BitLen()+3) - n := ecc.NafDecomposition(&e, eNAF[:]) + n := ecc.NafDecomposition(e, eNAF[:]) for i := n - 1; i >= 0; i-- { res.CyclotomicSquare(&res) if eNAF[i] == 1 { - res.Mul(&res, x) + res.Mul(&res, &x) } else if eNAF[i] == -1 { res.Mul(&res, &xInv) } @@ -483,41 +483,41 @@ func (z *E12) CyclotomicExp(x *E12, k big.Int) *E12 { return z } -// ExpGLV sets z=x**k and returns it +// ExpGLV sets z=xᵏ (q¹²) and returns it // uses 2-dimensional GLV with 2-bits windowed method // x must be in GT // TODO: use 2-NAF // TODO: use higher dimensional decomposition -func (z *E12) ExpGLV(x *E12, k big.Int) *E12 { +func (z *E12) ExpGLV(x E12, k *big.Int) *E12 { if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } e := k if k.Sign() == -1 { - // negative k, we invert - // if k < 0: xᵏ (mod q) == (x⁻¹)ᵏ (mod q) - x.Inverse(x) + // negative k, we invert (=conjugate) + // if k < 0: xᵏ (mod q¹²) == (x⁻¹)ᵏ (mod q¹²) + x.Conjugate(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - e = *bigIntPool.Get().(*big.Int) + e = bigIntPool.Get().(*big.Int) defer bigIntPool.Put(e) - e.Neg(&k) + e.Neg(k) } - var table [15]E12 + var table [15]E12 var res E12 var s1, s2 fr.Element res.SetOne() // table[b3b2b1b0-1] = 
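[Editor's note] The table comment split by the wrap here encodes a Straus-Shamir trick: entry b3b2b1b0 - 1 stores φ(x)^(b3b2) · x^(b1b0), so each 4-bit lookup advances a 2-bit window of both sub-scalars at once. A toy construction of such a table over modular integers; λ and all values are illustrative:

package main

import (
	"fmt"
	"math/big"
)

func main() {
	p := big.NewInt(1000003)
	x := big.NewInt(5)
	phiX := new(big.Int).Exp(x, big.NewInt(17), p) // φ modeled as x ↦ x^λ, λ = 17

	// table[i-1] = φ(x)^(i>>2) · x^(i&3), for i = 1..15
	var table [15]*big.Int
	for i := 1; i <= 15; i++ {
		hi := new(big.Int).Exp(phiX, big.NewInt(int64(i>>2)), p)
		lo := new(big.Int).Exp(x, big.NewInt(int64(i&3)), p)
		table[i-1] = hi.Mul(hi, lo)
		table[i-1].Mod(table[i-1], p)
	}

	// spot check: index 0b0110 is φ(x)¹·x², i.e. x^(17+2)
	want := new(big.Int).Exp(x, big.NewInt(19), p)
	fmt.Println(table[0b0110-1].Cmp(want) == 0) // true
}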
b3b2*Frobinius(x) + b1b0*x - table[0].Set(x) - table[3].Frobenius(x) + table[0].Set(&x) + table[3].Frobenius(&x) - // split the scalar, modifies +-x, Frob(x) accordingly - s := ecc.SplitScalar(&e, &glvBasis) + // split the scalar, modifies ±x, Frob(x) accordingly + s := ecc.SplitScalar(e, &glvBasis) if s[0].Sign() == -1 { s[0].Neg(&s[0]) diff --git a/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl index 35ed62c3e7..11612bc060 100644 --- a/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl @@ -155,32 +155,32 @@ func (z *E2) Legendre() int { return n.Legendre() } -// Exp sets z=x**e and returns it -func (z *E2) Exp(x E2, e *big.Int) *E2 { - if e.IsUint64() && e.Uint64() == 0 { +// Exp sets z=xᵏ (mod q²) and returns it +func (z *E2) Exp(x E2, k *big.Int) *E2 { + if k.IsUint64() && k.Uint64() == 0 { return z.SetOne() } - k := e + e := k if k.Sign() == -1 { // negative k, we invert - // if k < 0: xᵏ (mod q12) == (x⁻¹)ᵏ (mod q12) + // if k < 0: xᵏ (mod q²) == (x⁻¹)ᵏ (mod q²) x.Inverse(&x) // we negate k in a temp big.Int since // Int.Bit(_) of k and -k is different - k = bigIntPool.Get().(*big.Int) - defer bigIntPool.Put(k) - k.Neg(k) + e = bigIntPool.Get().(*big.Int) + defer bigIntPool.Put(e) + e.Neg(k) } z.SetOne() - b := e.Bytes() - for i :=0;i> j)) != 0 { + if (w & (0b10000000 >> j)) != 0 { z.Mul(z, &x) } } diff --git a/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl index 08b83a19cb..e94dc8988a 100644 --- a/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl @@ -376,8 +376,8 @@ func TestE12Ops(t *testing.T) { e.Exp(e, k) e.ToBigIntRegular(&_e) - c.Exp(a, _e) - d.CyclotomicExp(a, _e) + c.Exp(*a, &_e) + d.CyclotomicExp(*a, &_e) return c.Equal(&d) }, @@ -390,7 +390,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.Frobenius(a) - c.Exp(a, *q) + c.Exp(*a, q) return c.Equal(&b) }, genA, @@ -401,7 +401,7 @@ func TestE12Ops(t *testing.T) { var b, c E12 q := fp.Modulus() b.FrobeniusSquare(a) - c.Exp(a, *q).Exp(&c, *q) + c.Exp(*a, q).Exp(c, q) return c.Equal(&b) }, genA, From 5762868ce05a2482ab2688cec0e85618b50a7cc2 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Mon, 20 Jun 2022 11:23:27 +0100 Subject: [PATCH 13/16] test(tower): test negative exponent in Exp --- ecc/bls12-377/pairing_test.go | 9 ++++++--- ecc/bls12-378/pairing_test.go | 9 ++++++--- ecc/bls12-381/pairing_test.go | 9 ++++++--- ecc/bls24-315/pairing_test.go | 9 ++++++--- ecc/bls24-317/pairing_test.go | 9 ++++++--- ecc/bn254/pairing_test.go | 9 ++++++--- ecc/bw6-633/pairing_test.go | 9 ++++++--- ecc/bw6-756/pairing_test.go | 9 ++++++--- ecc/bw6-761/pairing_test.go | 9 ++++++--- .../generator/pairing/template/tests/pairing.go.tmpl | 9 ++++++--- 10 files changed, 60 insertions(+), 30 deletions(-) diff --git a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go index cfa761546c..c64efebada 100644 --- a/ecc/bls12-377/pairing_test.go +++ b/ecc/bls12-377/pairing_test.go @@ -69,15 +69,18 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(12) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + 
c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go index 020ace09a2..790d64d886 100644 --- a/ecc/bls12-378/pairing_test.go +++ b/ecc/bls12-378/pairing_test.go @@ -69,15 +69,18 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(12) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bls12-381/pairing_test.go b/ecc/bls12-381/pairing_test.go index 5c45046b1c..3262379ef8 100644 --- a/ecc/bls12-381/pairing_test.go +++ b/ecc/bls12-381/pairing_test.go @@ -69,15 +69,18 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(12) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go index e0102f8864..44a645790f 100644 --- a/ecc/bls24-315/pairing_test.go +++ b/ecc/bls24-315/pairing_test.go @@ -70,16 +70,19 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(24) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bls24-317/pairing_test.go b/ecc/bls24-317/pairing_test.go index 65bd739a75..23e7792c8f 100644 --- a/ecc/bls24-317/pairing_test.go +++ b/ecc/bls24-317/pairing_test.go @@ -70,15 +70,18 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(12) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bn254/pairing_test.go b/ecc/bn254/pairing_test.go index e29ae8bb76..da33256083 100644 --- a/ecc/bn254/pairing_test.go +++ b/ecc/bn254/pairing_test.go @@ -69,15 +69,18 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(12) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go index 3a3fd30c80..f8a0f84bbf 100644 --- a/ecc/bw6-633/pairing_test.go +++ b/ecc/bw6-633/pairing_test.go @@ -70,16 +70,19 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(6) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + 
c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go index ef0b6f670d..5c111bbd45 100644 --- a/ecc/bw6-756/pairing_test.go +++ b/ecc/bw6-756/pairing_test.go @@ -70,15 +70,18 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(12) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go index e92c59fa99..76bf81eb3a 100644 --- a/ecc/bw6-761/pairing_test.go +++ b/ecc/bw6-761/pairing_test.go @@ -70,16 +70,19 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int k := new(big.Int).SetUint64(6) e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) diff --git a/internal/generator/pairing/template/tests/pairing.go.tmpl b/internal/generator/pairing/template/tests/pairing.go.tmpl index 5ea344b836..a8209b7ac2 100644 --- a/internal/generator/pairing/template/tests/pairing.go.tmpl +++ b/internal/generator/pairing/template/tests/pairing.go.tmpl @@ -57,7 +57,7 @@ func TestPairing(t *testing.T) { func(a GT, e fp.Element) bool { a = FinalExponentiation(&a) - var _e big.Int + var _e, ne big.Int {{if or (eq .Name "bw6-761") (eq .Name "bw6-633")}} k := new(big.Int).SetUint64(6) {{else if eq .Name "bls24-315"}} @@ -67,10 +67,13 @@ func TestPairing(t *testing.T) { {{- end}} e.Exp(e, k) e.ToBigIntRegular(&_e) + ne.Neg(&_e) var b, c, d GT - b.Exp(a, &_e) - c.ExpGLV(a, &_e) + b.Exp(a, &ne) + b.Inverse(&b) + c.ExpGLV(a, &ne) + c.Conjugate(&c) d.CyclotomicExp(a, &_e) return b.Equal(&c) && c.Equal(&d) From c92a1d83ca94d74b95750d2be0f7dc5f322d129a Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Mon, 20 Jun 2022 18:01:11 +0100 Subject: [PATCH 14/16] =?UTF-8?q?feat:=20Multiexp=20=C3=A0=20la=20Pippenge?= =?UTF-8?q?r=20in=20GT?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ecc/bls12-377/pairing_test.go | 4 +- ecc/bls12-381/internal/fptower/multiexp.go | 1229 ++++++++++++++++++++ ecc/bls12-381/pairing_test.go | 177 +++ ecc/bls24-315/internal/fptower/multiexp.go | 1229 ++++++++++++++++++++ ecc/bls24-315/pairing_test.go | 182 ++- ecc/bls24-317/internal/fptower/multiexp.go | 1229 ++++++++++++++++++++ ecc/bls24-317/pairing_test.go | 180 ++- ecc/bn254/internal/fptower/multiexp.go | 1229 ++++++++++++++++++++ ecc/bn254/pairing_test.go | 177 +++ ecc/bw6-633/pairing_test.go | 180 ++- ecc/bw6-756/internal/fptower/multiexp.go | 569 +++++++++ ecc/bw6-756/pairing_test.go | 182 ++- ecc/bw6-761/internal/fptower/multiexp.go | 569 +++++++++ ecc/bw6-761/pairing_test.go | 180 ++- 14 files changed, 7299 insertions(+), 17 deletions(-) create mode 100644 ecc/bls12-381/internal/fptower/multiexp.go create mode 100644 ecc/bls24-315/internal/fptower/multiexp.go create mode 100644 ecc/bls24-317/internal/fptower/multiexp.go create mode 100644 ecc/bn254/internal/fptower/multiexp.go create mode 100644 ecc/bw6-756/internal/fptower/multiexp.go create mode 100644 ecc/bw6-761/internal/fptower/multiexp.go diff --git 
a/ecc/bls12-377/pairing_test.go b/ecc/bls12-377/pairing_test.go
index fb33dd50da..49a6430e88 100644
--- a/ecc/bls12-377/pairing_test.go
+++ b/ecc/bls12-377/pairing_test.go
@@ -362,7 +362,7 @@ func TestMultiExpGT(t *testing.T) {
 			// compute expected result with double and add
 			var finalScalar, mixerBigInt big.Int
 			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
-			expected.ExpGLV(&_g, &finalScalar)
+			expected.ExpGLV(_g, &finalScalar)

 			// mixer ensures that all the words of a fpElement are set
 			var sampleScalars [nbSamples]fr.Element
@@ -417,7 +417,7 @@ func TestMultiExpGT(t *testing.T) {
 			var op1ScalarMul GT
 			finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer)
 			finalBigScalar.ToBigIntRegular(&finalBigScalarBi)
-			op1ScalarMul.ExpGLV(&_g, &finalBigScalarBi)
+			op1ScalarMul.ExpGLV(_g, &finalBigScalarBi)

 			return op1ScalarMul.Equal(&op1MultiExp)
 		},
diff --git a/ecc/bls12-381/internal/fptower/multiexp.go b/ecc/bls12-381/internal/fptower/multiexp.go
new file mode 100644
index 0000000000..a79d38b06e
--- /dev/null
+++ b/ecc/bls12-381/internal/fptower/multiexp.go
@@ -0,0 +1,1229 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bls12-381/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
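+// e.g. with c = 4, a window of value 13 (≥ 2^{c-1} = 8) is recoded as the digit 13 - 16 = -3 with a carry of 1 into the next window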
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
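+				// e.g. with c = 8, a digit of 200 becomes 200 - 256 = -56 and carry = 1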
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E12) MultiExp(points []E12, scalars []fr.Element, config ecc.MultiExpConfig) (*E12, error) {
+	// note:
+	// each of the MsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// (in the curve MSMs the buckets use jacobian extended formulas; here a bucket "addition" is a multiplication in GT)
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
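+		// e.g. for nbPoints = 2^16: cost(c=16) = 256/16 * (2^16 + 2^16) ≈ 2.10e6, while cost(c=12) = 256/12 * (2^16 + 2^12) ≈ 1.49e6, so a mid-range c wins here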
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerE12 , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]E12, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerE12(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerE12(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.Mul(p, &_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerE12(p *E12, c int, points []E12, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.MsmC4(points, scalars, splitFirstChunk) + + case 5: + p.MsmC5(points, scalars, splitFirstChunk) + + case 6: + p.MsmC6(points, scalars, splitFirstChunk) + + case 7: + p.MsmC7(points, scalars, splitFirstChunk) + + case 8: + p.MsmC8(points, scalars, splitFirstChunk) + + case 9: + p.MsmC9(points, scalars, splitFirstChunk) + + case 10: + p.MsmC10(points, scalars, splitFirstChunk) + + case 11: + p.MsmC11(points, scalars, splitFirstChunk) + + case 12: + p.MsmC12(points, scalars, splitFirstChunk) + + case 13: + p.MsmC13(points, scalars, splitFirstChunk) + + case 14: + p.MsmC14(points, scalars, splitFirstChunk) + + case 15: + p.MsmC15(points, scalars, splitFirstChunk) + + case 16: + p.MsmC16(points, scalars, splitFirstChunk) + + case 20: + p.MsmC20(points, scalars, splitFirstChunk) + + case 21: + p.MsmC21(points, scalars, splitFirstChunk) + + case 22: + p.MsmC22(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkE12 reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkE12(p *E12, c int, chChunks []chan E12) *E12 { + var _p E12 + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.CyclotomicSquare(&_p) + } + totalj := <-chChunks[j] + _p.Mul(&_p, &totalj) + } + + p.Set(&_p) + return p +} + +func msmProcessChunkE12(chunk uint64, + chRes chan<- E12, + buckets []E12, + c uint64, + points []E12, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].SetOne() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + 
s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		var tmp E12
+		// if the msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].Mul(&buckets[bits-1], &points[i])
+		} else {
+			// sub
+			tmp.Conjugate(&points[i])
+			buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp)
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var one, runningSum, total E12
+	runningSum.SetOne()
+	total.SetOne()
+	one.SetOne()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].Equal(&one) {
+			runningSum.Mul(&runningSum, &buckets[k])
+		}
+		total.Mul(&total, &runningSum)
+	}
+
+	chRes <- total
+
+}
+
+func (p *E12) MsmC4(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 {
+	const (
+		c        = 4                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E12
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E12, 1)
+	}
+
+	processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) {
+		var buckets [1 << (c - 1)]E12
+		msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E12, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE12(p, c, chChunks[:])
+}
+
+func (p *E12) MsmC5(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 {
+	const (
+		c        = 5                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan E12
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E12, 1)
+	}
+
+	// c doesn't divide 256, last window is smaller we can allocate less buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	go func(j uint64, points []E12, scalars []fr.Element) {
+		var buckets [1 << (lastC - 1)]E12
+		msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars)
+	}(uint64(nbChunks), points, scalars)
+
+	processChunk := func(j
int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC6(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 6 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC7(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 7 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + 
msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC8(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC9(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 9 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() 
{ + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC10(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC11(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + 
return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC12(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC13(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC14(points []E12, scalars []fr.Element, 
splitFirstChunk bool) *E12 { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC15(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC16(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = 
(fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC20(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC21(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + 
chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC22(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 22 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} diff --git a/ecc/bls12-381/pairing_test.go b/ecc/bls12-381/pairing_test.go index 3262379ef8..1162ce3a97 100644 --- a/ecc/bls12-381/pairing_test.go +++ b/ecc/bls12-381/pairing_test.go @@ -19,8 +19,11 @@ package bls12381 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" "github.com/leanovate/gopter" @@ -284,6 +287,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := 
gopter.NewProperties(parameters)
+
+	genScalar := GenFr()
+
+	// size of the multiExps
+	const nbSamples = 143
+
+	// multi exp points
+	var samplePoints [nbSamples]GT
+	var _g, g GT
+	_g.SetRandom()
+
+	// put into GT
+	_g = FinalExponentiation(&_g)
+
+	g.Set(&_g)
+	for i := 1; i <= nbSamples; i++ {
+		samplePoints[i-1].Set(&g)
+		g.Mul(&g, &_g)
+	}
+
+	// final scalar to use in double and add method (without mixer factor)
+	// n(n+1)(2n+1)/6 (sum of the squares from 1 to n)
+	var scalar big.Int
+	scalar.SetInt64(nbSamples)
+	scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1))
+	scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1))
+	scalar.Div(&scalar, new(big.Int).SetInt64(6))
+
+	// ensure a multiexp that is split has the same result as a non-split one
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, splitted1, splitted2 GT
+
+			// mixer ensures that all the words of a fpElement are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePoints[:], scalars16, true)
+
+			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected, g GT
+			g.SetRandom()
+			// put into GT
+			g = FinalExponentiation(&_g)
+
+			// compute expected result with double and add
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(_g, &finalScalar)
+
+			// mixer ensures that all the words of a fpElement are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect to have a different multiExp than the above bucket method
+	// for a small number of points
+	properties.Property("[GT] Multi exponentiation (<50 points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of a fpElement are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+				samplePoints[i-1].Set(&g)
+				g.Mul(&g, &_g)
+			}
+
+			var op1MultiExp GT
+			op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{})
+
+			var finalBigScalar fr.Element
+			var finalBigScalarBi big.Int
+			var op1ScalarMul GT
+			finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer)
+			finalBigScalar.ToBigIntRegular(&finalBigScalarBi)
+			op1ScalarMul.ExpGLV(_g, &finalBigScalarBi)
+
+			return op1ScalarMul.Equal(&op1MultiExp)
+		},
+		genScalar,
+	))
+
+	properties.TestingRun(t, gopter.ConsoleReporter(false))
+}
+
 // ------------------------------------------------------------
 // benches

@@ -414,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) {
 		}
 	})
 }
+
+func BenchmarkMultiExpGT(b *testing.B) {
+	// ensure every word of the scalars is filled
+	var mixer fr.Element
+	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
+
+	const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64-bit arch, 12 on 32-bit
+	const nbSamples = 1 << pow
+
+	var samplePoints [nbSamples]GT
+	var sampleScalars [nbSamples]fr.Element
+	var _g GT
+	_g.SetRandom()
+
+	for i := 1; i <= nbSamples; i++ {
+		sampleScalars[i-1].SetUint64(uint64(i)).
+			Mul(&sampleScalars[i-1], &mixer).
+			FromMont()
+		samplePoints[i-1].Set(&_g)
+	}
+
+	var testPoint GT
+
+	for i := 5; i <= pow; i++ {
+		using := 1 << i
+
+		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
+	}
+}
diff --git a/ecc/bls24-315/internal/fptower/multiexp.go b/ecc/bls24-315/internal/fptower/multiexp.go
new file mode 100644
index 0000000000..72d09fce70
--- /dev/null
+++ b/ecc/bls24-315/internal/fptower/multiexp.go
@@ -0,0 +1,1229 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bls24-315/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
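+// e.g. with c = 4, a window of value 13 (≥ 2^{c-1} = 8) is recoded as the digit 13 - 16 = -3 with a carry of 1 into the next window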
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant proportion of the points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
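+				// for illustration, with c = 4 (max = 8, msbWindow = 0b1000): a window
+				// value of 13 becomes 13 - 16 = -3 with a carry of 1, and -3 is encoded
+				// below as (-(-3)-1) | msbWindow = 0b1010, i.e. magnitude-1 in the low
+				// bits plus a sign bit, decodable with a single mask in the bucket phase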
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E24) MultiExp(points []E24, scalars []fr.Element, config ecc.MultiExpConfig) (*E24, error) {
+	// note:
+	// each of the MsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows us to declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar, nbChunks digits over c-bit wide windows
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// since GT is written multiplicatively, bucket accumulation uses E24 multiplications
+	// and the doublings of the usual point msm become cyclotomic squarings
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for a G2 MultiExp of > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
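+		// for illustration, with 256-bit scalars and nbPoints = 1<<16:
+		// c = 8  costs 256/8  * (65536 + 256)   ~ 2.11M group ops
+		// c = 16 costs 256/16 * (65536 + 65536) ~ 2.10M group ops
+		// i.e. the optimum sits roughly where 2^c reaches the order of nbPoints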
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerE24 , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]E24, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerE24(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerE24(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.Mul(p, &_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerE24(p *E24, c int, points []E24, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.MsmC4(points, scalars, splitFirstChunk) + + case 5: + p.MsmC5(points, scalars, splitFirstChunk) + + case 6: + p.MsmC6(points, scalars, splitFirstChunk) + + case 7: + p.MsmC7(points, scalars, splitFirstChunk) + + case 8: + p.MsmC8(points, scalars, splitFirstChunk) + + case 9: + p.MsmC9(points, scalars, splitFirstChunk) + + case 10: + p.MsmC10(points, scalars, splitFirstChunk) + + case 11: + p.MsmC11(points, scalars, splitFirstChunk) + + case 12: + p.MsmC12(points, scalars, splitFirstChunk) + + case 13: + p.MsmC13(points, scalars, splitFirstChunk) + + case 14: + p.MsmC14(points, scalars, splitFirstChunk) + + case 15: + p.MsmC15(points, scalars, splitFirstChunk) + + case 16: + p.MsmC16(points, scalars, splitFirstChunk) + + case 20: + p.MsmC20(points, scalars, splitFirstChunk) + + case 21: + p.MsmC21(points, scalars, splitFirstChunk) + + case 22: + p.MsmC22(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkE24 reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkE24(p *E24, c int, chChunks []chan E24) *E24 { + var _p E24 + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.CyclotomicSquare(&_p) + } + totalj := <-chChunks[j] + _p.Mul(&_p, &totalj) + } + + p.Set(&_p) + return p +} + +func msmProcessChunkE24(chunk uint64, + chRes chan<- E24, + buckets []E24, + c uint64, + points []E24, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].SetOne() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + 
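+	// for illustration, with c = 5 and chunk = 12: jc = 60, so the window spans
+	// bits 60..63 of word 0 plus bit 0 of word 1; shift = 60 > 64-c = 59 triggers
+	// the multi-word path below, with nbBitsHigh = 1 and shiftHigh = 4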
s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + var tmp E24 + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].Mul(&buckets[bits-1], &points[i]) + } else { + // sub + tmp.Conjugate(&points[i]) + buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var one, runningSum, total E24 + runningSum.SetOne() + total.SetOne() + one.SetOne() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].Equal(&one) { + runningSum.Mul(&runningSum, &buckets[k]) + } + total.Mul(&total, &runningSum) + } + + chRes <- total + +} + +func (p *E24) MsmC4(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 4 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC5(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 5 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j 
int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC6(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 6 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC7(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 7 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + 
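+		// 1 << (c-1) = 64 buckets for c = 7: the signed digits halve the bucket
+		// count, and a constant-size array keeps the buckets on the stack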
msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC8(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC9(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 9 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() 
{ + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC10(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC11(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + 
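+	// msmReduceChunkE24 recombines the per-chunk totals Horner-style: starting from
+	// the most significant chunk, square c times (i.e. multiply the exponent by 2^c),
+	// then fold in the next chunk's total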
return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC12(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC13(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC14(points []E24, scalars []fr.Element, 
splitFirstChunk bool) *E24 { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC15(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC16(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = 
(fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC20(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC21(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + 
chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC22(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 22 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go index 44a645790f..1663205050 100644 --- a/ecc/bls24-315/pairing_test.go +++ b/ecc/bls24-315/pairing_test.go @@ -19,8 +19,11 @@ package bls24315 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" "github.com/leanovate/gopter" @@ -43,7 +46,6 @@ func TestPairing(t *testing.T) { properties := gopter.NewProperties(parameters) genA := GenE24() - genR1 := GenFr() genR2 := GenFr() genP := GenFp() @@ -73,7 +75,6 @@ func TestPairing(t *testing.T) { var _e, ne big.Int k := 
new(big.Int).SetUint64(24) - e.Exp(e, k) e.ToBigIntRegular(&_e) ne.Neg(&_e) @@ -286,6 +287,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := gopter.NewProperties(parameters) + + genScalar := GenFr() + + // size of the multiExps + const nbSamples = 143 + + // multi exp points + var samplePoints [nbSamples]GT + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + // final scalar to use in double and add method (without mixer factor) + // n(n+1)(2n+1)/6 (sum of the squares from 1 to n) + var scalar big.Int + scalar.SetInt64(nbSamples) + scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1)) + scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1)) + scalar.Div(&scalar, new(big.Int).SetInt64(6)) + + // ensure a multiexp that's splitted has the same result as a non-splitted one.. + properties.Property("[GT] Multi exponentation (c=16) should be consistant with splitted multiexp", prop.ForAll( + func(mixer fr.Element) bool { + var samplePointsLarge [nbSamples * 13]GT + for i := 0; i < 13; i++ { + copy(samplePointsLarge[i*nbSamples:], samplePoints[:]) + } + + var r16, splitted1, splitted2 GT + + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples * 13]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } + + scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) + r16.MsmC16(samplePoints[:], scalars16, true) + + splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) + splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) + return r16.Equal(&splitted1) && r16.Equal(&splitted2) + }, + genScalar, + )) + + // we test only c = 5 and c = 16 + properties.Property("[GT] Multi exponentation (c=5, c=16) should be consistant with sum of square", prop.ForAll( + func(mixer fr.Element) bool { + + var expected, g GT + g.SetRandom() + // put into GT + g = FinalExponentiation(&_g) + + // compute expected result with double and add + var finalScalar, mixerBigInt big.Int + finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt)) + expected.ExpGLV(_g, &finalScalar) + + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU()) + scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) + + var r5, r16 GT + r5.MsmC5(samplePoints[:], scalars5, false) + r16.MsmC16(samplePoints[:], scalars16, true) + return (r5.Equal(&expected) && r16.Equal(&expected)) + }, + genScalar, + )) + + // note : this test is here as we expect to have a different multiExp than the above bucket method + // for small number of points + properties.Property("[GT] Multi exponentation (<50points) should be consistant with sum of square", prop.ForAll( + func(mixer fr.Element) bool { + + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + + // mixer ensures that all the words of a fpElement are set + samplePoints := make([]GT, 30) + sampleScalars := make([]fr.Element, 30) + + for i := 1; i <= 30; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + // ------------------------------------------------------------ // benches @@ -390,8 +531,7 @@ func BenchmarkExpGT(b *testing.B) { var e fp.Element e.SetRandom() - k := new(big.Int).SetUint64(24) - + k := new(big.Int).SetUint64(12) e.Exp(e, k) var _e big.Int e.ToBigIntRegular(&_e) @@ -417,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&_g) + } + + var testPoint GT + + for i := 5; i <= pow; i++ { + using := 1 << i + + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) + } +} diff --git a/ecc/bls24-317/internal/fptower/multiexp.go b/ecc/bls24-317/internal/fptower/multiexp.go new file mode 100644 index 0000000000..7bc7ef3d1a --- /dev/null +++ b/ecc/bls24-317/internal/fptower/multiexp.go @@ -0,0 +1,1229 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package fptower + +import ( + "errors" + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" + "github.com/consensys/gnark-crypto/internal/parallel" + "math" + "runtime" +) + +/* Multi-Exponentiation à la Pippenger */ + +// selector stores the index, mask and shifts needed to select bits from a scalar +// it is used during the multiExp algorithm or the batch scalar multiplication +type selector struct { + index uint64 // index in the multi-word scalar to select bits from + mask uint64 // mask (c-bit wide) + shift uint64 // shift needed to get our bits on low positions + + multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) + maskHigh uint64 // same than mask, for index+1 + shiftHigh uint64 // same than shift, for index+1 +} + +// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits +// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract +// 2^{c} to the current digit, making it negative. +// negative digits can be processed in a later step as adding -G into the bucket instead of G +// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul) +// scalarsMont indicates wheter the provided scalars are in montgomery form +// returns smallValues, which represent the number of scalars which meets the following condition +// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { + toReturn := make([]fr.Element, len(scalars)) + + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) + } + selectors[chunk] = d + } + + // for each chunk, we could track the number of non-zeros points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represent a significant number of points, then we will split first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine + // if it does, though, this will deadlocK. 
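+	// smallValues is only a load-balancing hint: a scalar that fits in the low c bits
+	// contributes work to chunk 0 alone, so when such scalars exceed ~10% of the input,
+	// MultiExp splits the processing of chunk 0 in two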
+ chSmallValues := make(chan int, nbTasks) + + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int + + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.IsUint64() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } + + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] + + // init with carry if any + digit := carry + carry = 0 + + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + + // if digit is zero, no impact on result + if digit == 0 { + continue + } + + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. + if digit >= max { + digit -= (1 << c) + carry = 1 + } + + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } + + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues +} + +// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf +func (p *E24) MultiExp(points []E24, scalars []fr.Element, config ecc.MultiExpConfig) (*E24, error) { + // note: + // each of the MsmCX method is the same, except for the c constant it declares + // duplicating (through template generation) these methods allows to declare the buckets on the stack + // the choice of c needs to be improved: + // there is a theoritical value that gives optimal asymptotics + // but in practice, other factors come into play, including: + // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 + // * number of CPUs + // * cache friendliness (which depends on the host, G1 or G2... ) + // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. + + // for each MsmCX + // step 1 + // we compute, for each scalars over c-bit wide windows, nbChunk digits + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
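+	// after this rewriting every digit lies in [-2^{c-1}, 2^{c-1}-1], which is what
+	// lets step 2 make do with 2^{c-1} buckets (a negative digit conjugates the point)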
+ // negative digits will be processed in the next step as adding -G into the bucket instead of G + // (computing -G is cheap, and this saves us half of the buckets) + // step 2 + // buckets are declared on the stack + // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) + // we use jacobian extended formulas here as they are faster than mixed addition + // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel + // step 3 + // reduce the buckets weigthed sums into our result (msmReduceChunk) + + // ensure len(points) == len(scalars) + nbPoints := len(points) + if nbPoints != len(scalars) { + return nil, errors.New("len(points) != len(scalars)") + } + + // if nbTasks is not set, use all available CPUs + if config.NbTasks <= 0 { + config.NbTasks = runtime.NumCPU() + } + + // here, we compute the best C for nbPoints + // we split recursively until nbChunks(c) >= nbTasks, + bestC := func(nbPoints int) uint64 { + // implemented MsmC methods (the c we use must be in this slice) + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + var C uint64 + // approximate cost (in group operations) + // cost = bits/c * (nbPoints + 2^{c}) + // this needs to be verified empirically. + // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerE24 , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. 
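+	// each sub-MSM below computes the product of points[k]^scalars[k] over its slice
+	// of the input; GT being written multiplicatively, the partial results are folded
+	// into p with Mul rather than a point addition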
+ _p := make([]E24, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerE24(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerE24(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.Mul(p, &_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerE24(p *E24, c int, points []E24, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.MsmC4(points, scalars, splitFirstChunk) + + case 5: + p.MsmC5(points, scalars, splitFirstChunk) + + case 6: + p.MsmC6(points, scalars, splitFirstChunk) + + case 7: + p.MsmC7(points, scalars, splitFirstChunk) + + case 8: + p.MsmC8(points, scalars, splitFirstChunk) + + case 9: + p.MsmC9(points, scalars, splitFirstChunk) + + case 10: + p.MsmC10(points, scalars, splitFirstChunk) + + case 11: + p.MsmC11(points, scalars, splitFirstChunk) + + case 12: + p.MsmC12(points, scalars, splitFirstChunk) + + case 13: + p.MsmC13(points, scalars, splitFirstChunk) + + case 14: + p.MsmC14(points, scalars, splitFirstChunk) + + case 15: + p.MsmC15(points, scalars, splitFirstChunk) + + case 16: + p.MsmC16(points, scalars, splitFirstChunk) + + case 20: + p.MsmC20(points, scalars, splitFirstChunk) + + case 21: + p.MsmC21(points, scalars, splitFirstChunk) + + case 22: + p.MsmC22(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkE24 reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkE24(p *E24, c int, chChunks []chan E24) *E24 { + var _p E24 + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.CyclotomicSquare(&_p) + } + totalj := <-chChunks[j] + _p.Mul(&_p, &totalj) + } + + p.Set(&_p) + return p +} + +func msmProcessChunkE24(chunk uint64, + chRes chan<- E24, + buckets []E24, + c uint64, + points []E24, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].SetOne() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + var tmp E24 + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].Mul(&buckets[bits-1], &points[i]) + } else { + // sub + tmp.Conjugate(&points[i]) + buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var one, runningSum, total E24 + runningSum.SetOne() + total.SetOne() + one.SetOne() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].Equal(&one) { + runningSum.Mul(&runningSum, &buckets[k]) + } + total.Mul(&total, &runningSum) + } + + chRes <- total + +} + +func (p *E24) MsmC4(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 4 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC5(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 5 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC6(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 6 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 
64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC7(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 7 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC8(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop 
through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC9(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 9 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC10(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = 
(fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC11(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC12(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var 
buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC13(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC14(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + 
}(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC15(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC16(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + 
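+		// note: the two halves of the first chunk are processed concurrently below;
+		// their partial results arrive on chSplit and, GT being written multiplicatively,
+		// are combined with Mul before being forwarded to chChunks[0].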
split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC20(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC21(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, 
points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} + +func (p *E24) MsmC22(points []E24, scalars []fr.Element, splitFirstChunk bool) *E24 { + const ( + c = 22 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E24 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E24, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E24, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E24 + msmProcessChunkE24(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E24, scalars []fr.Element, chChunk chan E24) { + var buckets [1 << (c - 1)]E24 + msmProcessChunkE24(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E24, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE24(p, c, chChunks[:]) +} diff --git a/ecc/bls24-317/pairing_test.go b/ecc/bls24-317/pairing_test.go index 23e7792c8f..55b48fe256 100644 --- a/ecc/bls24-317/pairing_test.go +++ b/ecc/bls24-317/pairing_test.go @@ -19,8 +19,11 @@ package bls24317 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fp" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" "github.com/leanovate/gopter" @@ -43,7 +46,6 @@ func TestPairing(t *testing.T) { properties := gopter.NewProperties(parameters) genA := GenE24() - genR1 := GenFr() genR2 := GenFr() genP := GenFp() @@ -285,6 +287,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := gopter.NewProperties(parameters) + + genScalar := GenFr() + + // size of the multiExps + const nbSamples = 143 + + // multi exp points + var samplePoints [nbSamples]GT + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + // final scalar to use in double and add method (without mixer factor) + // n(n+1)(2n+1)/6 (sum of the squares from 1 to n) + var scalar big.Int + scalar.SetInt64(nbSamples) + scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1)) + scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1)) + scalar.Div(&scalar, new(big.Int).SetInt64(6)) + + // ensure a multiexp that's splitted has 
the same result as a non-splitted one.
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, splitted1, splitted2 GT
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePointsLarge[:], scalars16, true)
+
+			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected GT
+
+			// compute the expected result with double and add, using _g (already in GT)
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(_g, &finalScalar)
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect to have a different multiExp than the above bucket method
+	// for a small number of points
+	properties.Property("[GT] Multi exponentiation (<50 points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of a fr.Element are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+ FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + // ------------------------------------------------------------ // benches @@ -389,7 +531,7 @@ func BenchmarkExpGT(b *testing.B) { var e fp.Element e.SetRandom() - k := new(big.Int).SetUint64(12) + k := new(big.Int).SetUint64(24) e.Exp(e, k) var _e big.Int e.ToBigIntRegular(&_e) @@ -415,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&_g) + } + + var testPoint GT + + for i := 5; i <= pow; i++ { + using := 1 << i + + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) + } +} diff --git a/ecc/bn254/internal/fptower/multiexp.go b/ecc/bn254/internal/fptower/multiexp.go new file mode 100644 index 0000000000..41d1c06ec0 --- /dev/null +++ b/ecc/bn254/internal/fptower/multiexp.go @@ -0,0 +1,1229 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if a digit is larger than 2^{c-1}, then we borrow 2^{c} from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// it also returns smallValues, the number of scalars which meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant proportion of the points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
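+	// illustrative walk-through (numbers only, with c = 4: mask = 0b1111, max = 8, msbWindow = 0b1000):
+	// a window holding 0b1110 (= 14 >= max) becomes digit = 14 - 16 = -2 with carry = 1;
+	// it is stored below as bits = (2-1) | msbWindow = 0b1001, i.e. "subtract bucket #1",
+	// which the bucket reduction later weights by 2 -- recovering the intended -2.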
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if the digit is zero, it has no impact on the result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^{c} from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E12) MultiExp(points []E12, scalars []fr.Element, config ecc.MultiExpConfig) (*E12, error) {
+	// note:
+	// each of the MsmCX methods is the same, except for the c constant it declares
+	// duplicating these methods (through template generation) allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunks digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^{c} from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerE12, but that would incur the cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
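+	// illustrative (assuming bestC keeps returning 16, i.e. 16 chunks per scalar):
+	// with config.NbTasks = 64, the loop above doubles nbSplits until
+	// nbChunks*nbSplits >= 64, ending with nbSplits = 4 slices of points,
+	// processed in parallel below and multiplied back together at the end.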
+ _p := make([]E12, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerE12(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerE12(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.Mul(p, &_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerE12(p *E12, c int, points []E12, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.MsmC4(points, scalars, splitFirstChunk) + + case 5: + p.MsmC5(points, scalars, splitFirstChunk) + + case 6: + p.MsmC6(points, scalars, splitFirstChunk) + + case 7: + p.MsmC7(points, scalars, splitFirstChunk) + + case 8: + p.MsmC8(points, scalars, splitFirstChunk) + + case 9: + p.MsmC9(points, scalars, splitFirstChunk) + + case 10: + p.MsmC10(points, scalars, splitFirstChunk) + + case 11: + p.MsmC11(points, scalars, splitFirstChunk) + + case 12: + p.MsmC12(points, scalars, splitFirstChunk) + + case 13: + p.MsmC13(points, scalars, splitFirstChunk) + + case 14: + p.MsmC14(points, scalars, splitFirstChunk) + + case 15: + p.MsmC15(points, scalars, splitFirstChunk) + + case 16: + p.MsmC16(points, scalars, splitFirstChunk) + + case 20: + p.MsmC20(points, scalars, splitFirstChunk) + + case 21: + p.MsmC21(points, scalars, splitFirstChunk) + + case 22: + p.MsmC22(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkE12 reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkE12(p *E12, c int, chChunks []chan E12) *E12 { + var _p E12 + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.CyclotomicSquare(&_p) + } + totalj := <-chChunks[j] + _p.Mul(&_p, &totalj) + } + + p.Set(&_p) + return p +} + +func msmProcessChunkE12(chunk uint64, + chRes chan<- E12, + buckets []E12, + c uint64, + points []E12, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].SetOne() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + var tmp E12 + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].Mul(&buckets[bits-1], &points[i]) + } else { + // sub + tmp.Conjugate(&points[i]) + buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var one, runningSum, total E12 + runningSum.SetOne() + total.SetOne() + one.SetOne() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].Equal(&one) { + runningSum.Mul(&runningSum, &buckets[k]) + } + total.Mul(&total, &runningSum) + } + + chRes <- total + +} + +func (p *E12) MsmC4(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 4 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC5(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 5 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC6(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 6 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 
64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC7(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 7 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC8(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop 
through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC9(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 9 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC10(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = 
(fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC11(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC12(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var 
buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC13(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC14(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + 
}(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC15(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC16(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + 
split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC20(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC21(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, 
points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} + +func (p *E12) MsmC22(points []E12, scalars []fr.Element, splitFirstChunk bool) *E12 { + const ( + c = 22 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E12 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E12, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E12, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E12 + msmProcessChunkE12(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E12, scalars []fr.Element, chChunk chan E12) { + var buckets [1 << (c - 1)]E12 + msmProcessChunkE12(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E12, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE12(p, c, chChunks[:]) +} diff --git a/ecc/bn254/pairing_test.go b/ecc/bn254/pairing_test.go index da33256083..ff4252106b 100644 --- a/ecc/bn254/pairing_test.go +++ b/ecc/bn254/pairing_test.go @@ -19,8 +19,11 @@ package bn254 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fp" "github.com/consensys/gnark-crypto/ecc/bn254/fr" "github.com/leanovate/gopter" @@ -284,6 +287,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := gopter.NewProperties(parameters) + + genScalar := GenFr() + + // size of the multiExps + const nbSamples = 143 + + // multi exp points + var samplePoints [nbSamples]GT + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + // final scalar to use in double and add method (without mixer factor) + // n(n+1)(2n+1)/6 (sum of the squares from 1 to n) + var scalar big.Int + scalar.SetInt64(nbSamples) + scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1)) + scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1)) + scalar.Div(&scalar, new(big.Int).SetInt64(6)) + + // ensure a multiexp that's splitted has the same result as a non-splitted one.. 
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, splitted1, splitted2 GT
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePoints[:], scalars16, true)
+
+			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected, g GT
+			g.SetRandom()
+			// put into GT
+			g = FinalExponentiation(&_g)
+
+			// compute expected result with double and add
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(_g, &finalScalar)
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect to have a different multiExp than the above bucket method
+	// for small numbers of points
+	properties.Property("[GT] Multi exponentiation (<50points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of a fr.Element are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+ FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + // ------------------------------------------------------------ // benches @@ -414,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&_g) + } + + var testPoint GT + + for i := 5; i <= pow; i++ { + using := 1 << i + + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) + } +} diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go index f8a0f84bbf..e46b0086c1 100644 --- a/ecc/bw6-633/pairing_test.go +++ b/ecc/bw6-633/pairing_test.go @@ -19,8 +19,11 @@ package bw6633 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" "github.com/leanovate/gopter" @@ -43,7 +46,6 @@ func TestPairing(t *testing.T) { properties := gopter.NewProperties(parameters) genA := GenE6() - genR1 := GenFr() genR2 := GenFr() genP := GenFp() @@ -73,7 +75,6 @@ func TestPairing(t *testing.T) { var _e, ne big.Int k := new(big.Int).SetUint64(6) - e.Exp(e, k) e.ToBigIntRegular(&_e) ne.Neg(&_e) @@ -286,6 +287,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := gopter.NewProperties(parameters) + + genScalar := GenFr() + + // size of the multiExps + const nbSamples = 143 + + // multi exp points + var samplePoints [nbSamples]GT + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + // final scalar to use in double and add method (without mixer factor) + // n(n+1)(2n+1)/6 (sum of the squares from 1 to n) + var scalar big.Int + scalar.SetInt64(nbSamples) + scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1)) + scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1)) + scalar.Div(&scalar, new(big.Int).SetInt64(6)) + + // ensure a multiexp that's splitted has the same result as a non-splitted one.. 
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, splitted1, splitted2 GT
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePoints[:], scalars16, true)
+
+			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected, g GT
+			g.SetRandom()
+			// put into GT
+			g = FinalExponentiation(&_g)
+
+			// compute expected result with double and add
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(_g, &finalScalar)
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect to have a different multiExp than the above bucket method
+	// for small numbers of points
+	properties.Property("[GT] Multi exponentiation (<50points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of a fr.Element are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+ FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + // ------------------------------------------------------------ // benches @@ -391,7 +532,6 @@ func BenchmarkExpGT(b *testing.B) { e.SetRandom() k := new(big.Int).SetUint64(6) - e.Exp(e, k) var _e big.Int e.ToBigIntRegular(&_e) @@ -417,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&_g) + } + + var testPoint GT + + for i := 5; i <= pow; i++ { + using := 1 << i + + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) + } +} diff --git a/ecc/bw6-756/internal/fptower/multiexp.go b/ecc/bw6-756/internal/fptower/multiexp.go new file mode 100644 index 0000000000..9179829638 --- /dev/null +++ b/ecc/bw6-756/internal/fptower/multiexp.go @@ -0,0 +1,569 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
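Editor's note: the new fptower multiexp file below ports the curve-side Pippenger bucket method to GT, where group addition becomes field multiplication and point negation becomes conjugation. The following is a minimal, self-contained sketch of one bucket pass under that substitution; all names are illustrative (not the generated API), digits are unsigned for simplicity (the generated code uses a signed-digit variant), and Z/101Z* stands in for GT.

// Sketch only: Pippenger bucket accumulation in a multiplicative group.
package main

import (
	"fmt"
	"math/big"
)

// multiExpWindow computes prod_i bases[i]^digits[i] for c-bit digits using
// 2^c - 1 buckets: bucket[d-1] accumulates every base whose digit is d, then
// the weighted product prod_k bucket[k]^(k+1) is recovered with two running
// products, mirroring msmProcessChunk below.
func multiExpWindow(bases []*big.Int, digits []uint, c uint, p *big.Int) *big.Int {
	buckets := make([]*big.Int, (1<<c)-1)
	for i := range buckets {
		buckets[i] = big.NewInt(1)
	}
	for i, d := range digits {
		if d != 0 {
			buckets[d-1].Mul(buckets[d-1], bases[i]).Mod(buckets[d-1], p)
		}
	}
	running, total := big.NewInt(1), big.NewInt(1)
	for k := len(buckets) - 1; k >= 0; k-- {
		running.Mul(running, buckets[k]).Mod(running, p)
		total.Mul(total, running).Mod(total, p)
	}
	return total
}

func main() {
	p := big.NewInt(101)
	bases := []*big.Int{big.NewInt(2), big.NewInt(3), big.NewInt(5)}
	digits := []uint{3, 1, 3} // one 2-bit window per scalar
	// prints 71 == 2^3 * 3^1 * 5^3 mod 101
	fmt.Println(multiExpWindow(bases, digits, 2, p))
}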
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bw6-756/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process;
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated).
+	// a simplified approach is to track the small values where only the first word is set:
+	// if this number represents a significant fraction of the points, we split the first chunk's
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time.
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines;
+	// if it does, though, this will deadlock.
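+	//
+	// worked example of the signed-digit rewrite (editor's illustration, assuming c = 4, so max = 8):
+	// a window value of 13 exceeds 2^{c-1} = 8, so it becomes 13 - 16 = -3 with a carry of 1 into
+	// the next window; -3 is encoded as uint64(-(-3)-1) | msbWindow = 2 | msbWindow, and
+	// msmProcessChunk later multiplies bucket index 2 (weight 3) by the point's conjugate.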
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E6) MultiExp(points []E6, scalars []fr.Element, config ecc.MultiExpConfig) (*E6, error) {
+	// note:
+	// each of the MsmCX methods is identical, except for the c constant it declares.
+	// duplicating these methods (through template generation) allows declaring the buckets on the stack.
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics,
+	// but in practice other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunks digits.
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack;
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1).
+	// bucket accumulation here uses cyclotomic-subgroup multiplications, the GT analogue of the
+	// extended-Jacobian additions used on the curve side.
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically:
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW),
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2;
+	// we may want to do that in msmInnerE6, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
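+	// (editor's note: "sum" is multiplicative in GT; splitting the points into halves P1 and P2
+	// is sound because prod_i g_i^{k_i} = (prod_{i in P1} g_i^{k_i}) * (prod_{i in P2} g_i^{k_i}),
+	// which is why the partial results below are folded back in with p.Mul.)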
+	_p := make([]E6, nbSplits-1)
+	chDone := make(chan int, nbSplits-1)
+	for i := 0; i < nbSplits-1; i++ {
+		start := i * nbPoints
+		end := start + nbPoints
+		go func(start, end, i int) {
+			msmInnerE6(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
+			chDone <- i
+		}(start, end, i)
+	}
+
+	msmInnerE6(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk)
+	for i := 0; i < nbSplits-1; i++ {
+		done := <-chDone
+		p.Mul(p, &_p[done])
+	}
+	close(chDone)
+	return p, nil
+}
+
+func msmInnerE6(p *E6, c int, points []E6, scalars []fr.Element, splitFirstChunk bool) {
+
+	switch c {
+
+	case 4:
+		p.MsmC4(points, scalars, splitFirstChunk)
+
+	case 5:
+		p.MsmC5(points, scalars, splitFirstChunk)
+
+	case 8:
+		p.MsmC8(points, scalars, splitFirstChunk)
+
+	case 16:
+		p.MsmC16(points, scalars, splitFirstChunk)
+
+	default:
+		panic("not implemented")
+	}
+}
+
+// msmReduceChunkE6 reduces the weighted sums of the buckets into the result of the multiExp
+func msmReduceChunkE6(p *E6, c int, chChunks []chan E6) *E6 {
+	var _p E6
+	totalj := <-chChunks[len(chChunks)-1]
+	_p.Set(&totalj)
+	for j := len(chChunks) - 2; j >= 0; j-- {
+		for l := 0; l < c; l++ {
+			_p.CyclotomicSquare(&_p)
+		}
+		totalj := <-chChunks[j]
+		_p.Mul(&_p, &totalj)
+	}
+
+	p.Set(&_p)
+	return p
+}
+
+func msmProcessChunkE6(chunk uint64,
+	chRes chan<- E6,
+	buckets []E6,
+	c uint64,
+	points []E6,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].SetOne()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		var tmp E6
+		// if the msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].Mul(&buckets[bits-1], &points[i])
+		} else {
+			// sub
+			tmp.Conjugate(&points[i])
+			buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp)
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] + + var one, runningSum, total E6 + runningSum.SetOne() + total.SetOne() + one.SetOne() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].Equal(&one) { + runningSum.Mul(&runningSum, &buckets[k]) + } + total.Mul(&total, &runningSum) + } + + chRes <- total + +} + +func (p *E6) MsmC4(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 { + const ( + c = 4 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E6 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E6, 1) + } + + processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) { + var buckets [1 << (c - 1)]E6 + msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E6, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE6(p, c, chChunks[:]) +} + +func (p *E6) MsmC5(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 { + const ( + c = 5 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan E6 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E6, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + go func(j uint64, points []E6, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]E6 + msmProcessChunkE6(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + + processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) { + var buckets [1 << (c - 1)]E6 + msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E6, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE6(p, c, chChunks[:]) +} + +func (p *E6) MsmC8(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 { + const ( + c = 8 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit 
radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E6 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E6, 1) + } + + processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) { + var buckets [1 << (c - 1)]E6 + msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E6, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE6(p, c, chChunks[:]) +} + +func (p *E6) MsmC16(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan E6 + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan E6, 1) + } + + processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) { + var buckets [1 << (c - 1)]E6 + msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan E6, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.Mul(&s1, &s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkE6(p, c, chChunks[:]) +} diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go index 5c111bbd45..bf3cdcc402 100644 --- a/ecc/bw6-756/pairing_test.go +++ b/ecc/bw6-756/pairing_test.go @@ -19,8 +19,11 @@ package bw6756 import ( "fmt" "math/big" + "math/bits" + "runtime" "testing" + "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" "github.com/leanovate/gopter" @@ -43,7 +46,6 @@ func TestPairing(t *testing.T) { properties := gopter.NewProperties(parameters) genA := GenE6() - genR1 := GenFr() genR2 := GenFr() genP := GenFp() @@ -72,7 +74,7 @@ func TestPairing(t *testing.T) { var _e, ne big.Int - k := new(big.Int).SetUint64(12) + k := new(big.Int).SetUint64(6) e.Exp(e, k) e.ToBigIntRegular(&_e) ne.Neg(&_e) @@ -285,6 +287,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + 
parameters.MinSuccessfulTests = 2
+
+	properties := gopter.NewProperties(parameters)
+
+	genScalar := GenFr()
+
+	// size of the multiExps
+	const nbSamples = 143
+
+	// multi exp points
+	var samplePoints [nbSamples]GT
+	var _g, g GT
+	_g.SetRandom()
+
+	// put into GT
+	_g = FinalExponentiation(&_g)
+
+	g.Set(&_g)
+	for i := 1; i <= nbSamples; i++ {
+		samplePoints[i-1].Set(&g)
+		g.Mul(&g, &_g)
+	}
+
+	// final scalar to use in double and add method (without mixer factor)
+	// n(n+1)(2n+1)/6 (sum of the squares from 1 to n)
+	var scalar big.Int
+	scalar.SetInt64(nbSamples)
+	scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1))
+	scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1))
+	scalar.Div(&scalar, new(big.Int).SetInt64(6))
+
+	// ensure a multiexp that's split has the same result as a non-split one
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, splitted1, splitted2 GT
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePoints[:], scalars16, true)
+
+			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected, g GT
+			g.SetRandom()
+			// put into GT
+			g = FinalExponentiation(&_g)
+
+			// compute expected result with double and add
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(_g, &finalScalar)
+
+			// mixer ensures that all the words of a fr.Element are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
+	// note: this test is here as we expect to have a different multiExp than the above bucket method
+	// for small numbers of points
+	properties.Property("[GT] Multi exponentiation (<50points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of a fr.Element are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+ FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + // ------------------------------------------------------------ // benches @@ -389,7 +531,7 @@ func BenchmarkExpGT(b *testing.B) { var e fp.Element e.SetRandom() - k := new(big.Int).SetUint64(12) + k := new(big.Int).SetUint64(6) e.Exp(e, k) var _e big.Int e.ToBigIntRegular(&_e) @@ -415,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&_g) + } + + var testPoint GT + + for i := 5; i <= pow; i++ { + using := 1 << i + + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) + } +} diff --git a/ecc/bw6-761/internal/fptower/multiexp.go b/ecc/bw6-761/internal/fptower/multiexp.go new file mode 100644 index 0000000000..97216f80f9 --- /dev/null +++ b/ecc/bw6-761/internal/fptower/multiexp.go @@ -0,0 +1,569 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
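Editor's note: a companion sketch for the other half of the pipeline, the msmReduceChunk recombination of per-window totals: c squarings between consecutive windows play the role of the doublings in a double-and-add ladder. Illustrative only, with math/big modular arithmetic standing in for GT and a hypothetical reduceWindows helper.

// Sketch only: recombining window totals, least significant window first.
package main

import (
	"fmt"
	"math/big"
)

// reduceWindows folds per-window totals t_0..t_{m-1} into
// ((t_{m-1}^(2^c)) * t_{m-2})^(2^c) * ... * t_0, squaring c times between
// windows, the multiplicative analogue of msmReduceChunkE6's CyclotomicSquare loop.
func reduceWindows(totals []*big.Int, c uint, p *big.Int) *big.Int {
	acc := new(big.Int).Set(totals[len(totals)-1])
	for j := len(totals) - 2; j >= 0; j-- {
		for l := uint(0); l < c; l++ {
			acc.Mul(acc, acc).Mod(acc, p) // squaring stands in for CyclotomicSquare
		}
		acc.Mul(acc, totals[j]).Mod(acc, p)
	}
	return acc
}

func main() {
	p := big.NewInt(101)
	// g^k with g = 2 and k = 0b1101 = 13, split into two 2-bit windows: 0b11 (high), 0b01 (low)
	hi := new(big.Int).Exp(big.NewInt(2), big.NewInt(3), p) // window total for digit 3
	lo := new(big.Int).Exp(big.NewInt(2), big.NewInt(1), p) // window total for digit 1
	fmt.Println(reduceWindows([]*big.Int{lo, hi}, 2, p))    // 11
	fmt.Println(new(big.Int).Exp(big.NewInt(2), big.NewInt(13), p)) // 2^13 mod 101 = 11
}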
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bw6-761/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process;
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated).
+	// a simplified approach is to track the small values where only the first word is set:
+	// if this number represents a significant fraction of the points, we split the first chunk's
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time.
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines;
+	// if it does, though, this will deadlock.
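+	//
+	// (editor's note: a scalar counts as "small" when it fits in one word and only its low c bits
+	// are set, e.g. 42 with c = 16; such scalars contribute work to the first window only, which is
+	// why a high proportion of them triggers the splitFirstChunk path in the msm.)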
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E6) MultiExp(points []E6, scalars []fr.Element, config ecc.MultiExpConfig) (*E6, error) {
+	// note:
+	// each of the MsmCX methods is identical, except for the c constant it declares.
+	// duplicating these methods (through template generation) allows declaring the buckets on the stack.
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics,
+	// but in practice other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunks digits.
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack;
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1).
+	// bucket accumulation here uses cyclotomic-subgroup multiplications, the GT analogue of the
+	// extended-Jacobian additions used on the curve side.
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically:
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW),
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2;
+	// we may want to do that in msmInnerE6, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
+	_p := make([]E6, nbSplits-1)
+	chDone := make(chan int, nbSplits-1)
+	for i := 0; i < nbSplits-1; i++ {
+		start := i * nbPoints
+		end := start + nbPoints
+		go func(start, end, i int) {
+			msmInnerE6(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
+			chDone <- i
+		}(start, end, i)
+	}
+
+	msmInnerE6(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk)
+	for i := 0; i < nbSplits-1; i++ {
+		done := <-chDone
+		p.Mul(p, &_p[done])
+	}
+	close(chDone)
+	return p, nil
+}
+
+func msmInnerE6(p *E6, c int, points []E6, scalars []fr.Element, splitFirstChunk bool) {
+
+	switch c {
+
+	case 4:
+		p.MsmC4(points, scalars, splitFirstChunk)
+
+	case 5:
+		p.MsmC5(points, scalars, splitFirstChunk)
+
+	case 8:
+		p.MsmC8(points, scalars, splitFirstChunk)
+
+	case 16:
+		p.MsmC16(points, scalars, splitFirstChunk)
+
+	default:
+		panic("not implemented")
+	}
+}
+
+// msmReduceChunkE6 reduces the weighted sum of the buckets into the result of the multiExp
+func msmReduceChunkE6(p *E6, c int, chChunks []chan E6) *E6 {
+	var _p E6
+	totalj := <-chChunks[len(chChunks)-1]
+	_p.Set(&totalj)
+	for j := len(chChunks) - 2; j >= 0; j-- {
+		for l := 0; l < c; l++ {
+			_p.CyclotomicSquare(&_p)
+		}
+		totalj := <-chChunks[j]
+		_p.Mul(&_p, &totalj)
+	}
+
+	p.Set(&_p)
+	return p
+}
+
+func msmProcessChunkE6(chunk uint64,
+	chRes chan<- E6,
+	buckets []E6,
+	c uint64,
+	points []E6,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].SetOne()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		var tmp E6
+		// if the msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].Mul(&buckets[bits-1], &points[i])
+		} else {
+			// sub
+			tmp.Conjugate(&points[i])
+			buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp)
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var one, runningSum, total E6
+	runningSum.SetOne()
+	total.SetOne()
+	one.SetOne()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].Equal(&one) {
+			runningSum.Mul(&runningSum, &buckets[k])
+		}
+		total.Mul(&total, &runningSum)
+	}
+
+	chRes <- total
+
+}
+
+func (p *E6) MsmC4(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 4                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
+
+func (p *E6) MsmC5(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 5                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	// c doesn't divide 256, last window is smaller we can allocate less buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	go func(j uint64, points []E6, scalars []fr.Element) {
+		var buckets [1 << (lastC - 1)]E6
+		msmProcessChunkE6(j, chChunks[j], buckets[:], c, points, scalars)
+	}(uint64(nbChunks), points, scalars)
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
+
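+// (aside on MsmC5 above, assuming a 384-bit scalar field, i.e. fr.Limbs = 6 as for the
+// BW6 curves: c = 5 leaves a last window of lastC = 384 - 5*76 = 4 bits, which is why
+// the extra chunk spawned above only needs 2^{4-1} = 8 buckets)
+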
+func (p *E6) MsmC8(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 8                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
+
+func (p *E6) MsmC16(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 16                  // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go
index 76bf81eb3a..436e6ed4d5 100644
--- a/ecc/bw6-761/pairing_test.go
+++ b/ecc/bw6-761/pairing_test.go
@@ -19,8 +19,11 @@ package bw6761
 import (
 	"fmt"
 	"math/big"
+	"math/bits"
+	"runtime"
 	"testing"
 
+	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/bw6-761/fp"
 	"github.com/consensys/gnark-crypto/ecc/bw6-761/fr"
 	"github.com/leanovate/gopter"
@@ -43,7 +46,6 @@ func TestPairing(t *testing.T) {
 	properties := gopter.NewProperties(parameters)
 
 	genA := GenE6()
-	genR1 := GenFr()
 	genR2 := GenFr()
 	genP := GenFp()
@@ -73,7 +75,6 @@ func TestPairing(t *testing.T) {
 
 			var _e, ne big.Int
 			k := new(big.Int).SetUint64(6)
-			e.Exp(e, k)
 			e.ToBigIntRegular(&_e)
 			ne.Neg(&_e)
@@ -286,6 +287,146 @@ func TestMillerLoop(t *testing.T) {
 	properties.TestingRun(t, gopter.ConsoleReporter(false))
 }
 
+func TestMultiExpGT(t *testing.T) {
+
+	parameters := gopter.DefaultTestParameters()
+	parameters.MinSuccessfulTests = 2
+
+	properties := gopter.NewProperties(parameters)
+
+	genScalar := GenFr()
+
+	// size of the multiExps
+	const nbSamples = 143
+
+	// multi exp points
+	var samplePoints [nbSamples]GT
+	var _g, g GT
+	_g.SetRandom()
+
+	// put into GT
+	_g = FinalExponentiation(&_g)
+
+	g.Set(&_g)
+	for i := 1; i <= nbSamples; i++ {
+		samplePoints[i-1].Set(&g)
+		g.Mul(&g, &_g)
+	}
+
+	// final scalar to use in double and add method (without mixer factor)
+	// n(n+1)(2n+1)/6 (sum of the squares from 1 to n)
+	var scalar big.Int
+	scalar.SetInt64(nbSamples)
+	scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1))
+	scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1))
+	scalar.Div(&scalar, new(big.Int).SetInt64(6))
+
+	// ensure a multiexp that's split has the same result as a non-split one
+	properties.Property("[GT] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
+		func(mixer fr.Element) bool {
+			var samplePointsLarge [nbSamples * 13]GT
+			for i := 0; i < 13; i++ {
+				copy(samplePointsLarge[i*nbSamples:], samplePoints[:])
+			}
+
+			var r16, splitted1, splitted2 GT
+
+			// mixer ensures that all the words of a fpElement are set
+			var sampleScalars [nbSamples * 13]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+			r16.MsmC16(samplePoints[:], scalars16, true)
+
+			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
+			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
+			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
+		},
+		genScalar,
+	))
+
+	// we test only c = 5 and c = 16
+	properties.Property("[GT] Multi exponentiation (c=5, c=16) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var expected GT
+
+			// compute expected result with double and add
+			var finalScalar, mixerBigInt big.Int
+			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
+			expected.ExpGLV(_g, &finalScalar)
+
+			// mixer ensures that all the words of a fpElement are set
+			var sampleScalars [nbSamples]fr.Element
+
+			for i := 1; i <= nbSamples; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+			}
+
+			scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU())
+			scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
+
+			var r5, r16 GT
+			r5.MsmC5(samplePoints[:], scalars5, false)
+			r16.MsmC16(samplePoints[:], scalars16, true)
+			return (r5.Equal(&expected) && r16.Equal(&expected))
+		},
+		genScalar,
+	))
+
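+	// (why the sum of squares above: samplePoints[i-1] = _g^{i} and sampleScalars[i-1] = i*mixer,
+	// so the MSM computes _g^{(1*1 + 2*2 + ... + n*n)*mixer} = _g^{n(n+1)(2n+1)/6 * mixer},
+	// which is exactly expected = ExpGLV(_g, scalar*mixer))
+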
+	// note: this test is here as we expect to have a different multiExp from the above bucket method
+	// for a small number of points
+	properties.Property("[GT] Multi exponentiation (<50 points) should be consistent with sum of squares", prop.ForAll(
+		func(mixer fr.Element) bool {
+
+			var _g, g GT
+			_g.SetRandom()
+
+			// put into GT
+			_g = FinalExponentiation(&_g)
+
+			g.Set(&_g)
+
+			// mixer ensures that all the words of a fpElement are set
+			samplePoints := make([]GT, 30)
+			sampleScalars := make([]fr.Element, 30)
+
+			for i := 1; i <= 30; i++ {
+				sampleScalars[i-1].SetUint64(uint64(i)).
+					Mul(&sampleScalars[i-1], &mixer).
+					FromMont()
+				samplePoints[i-1].Set(&g)
+				g.Mul(&g, &_g)
+			}
+
+			var op1MultiExp GT
+			op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{})
+
+			var finalBigScalar fr.Element
+			var finalBigScalarBi big.Int
+			var op1ScalarMul GT
+			finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer)
+			finalBigScalar.ToBigIntRegular(&finalBigScalarBi)
+			op1ScalarMul.ExpGLV(_g, &finalBigScalarBi)
+
+			return op1ScalarMul.Equal(&op1MultiExp)
+		},
+		genScalar,
+	))
+
+	properties.TestingRun(t, gopter.ConsoleReporter(false))
+}
+
 // ------------------------------------------------------------
 // benches
@@ -391,7 +532,6 @@ func BenchmarkExpGT(b *testing.B) {
 	e.SetRandom()
 
 	k := new(big.Int).SetUint64(6)
-	e.Exp(e, k)
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
@@ -417,3 +557,37 @@ func BenchmarkExpGT(b *testing.B) {
 		}
 	})
 }
+
+func BenchmarkMultiExpGT(b *testing.B) {
+	// ensure every word of the scalars is filled
+	var mixer fr.Element
+	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
+
+	const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64-bit arch, 12 on 32-bit
+	const nbSamples = 1 << pow
+
+	var samplePoints [nbSamples]GT
+	var sampleScalars [nbSamples]fr.Element
+	var _g GT
+	_g.SetRandom()
+
+	for i := 1; i <= nbSamples; i++ {
+		sampleScalars[i-1].SetUint64(uint64(i)).
+			Mul(&sampleScalars[i-1], &mixer).
+			FromMont()
+		samplePoints[i-1].Set(&_g)
+	}
+
+	var testPoint GT
+
+	for i := 5; i <= pow; i++ {
+		using := 1 << i
+
+		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
+	}
+}

From 8ad6473f4dad12dae00298de3994a234989f6122 Mon Sep 17 00:00:00 2001
From: Youssef El Housni
Date: Mon, 20 Jun 2022 18:14:55 +0100
Subject: [PATCH 15/16] refactor: code generation of MultiExp test in pairing_test

---
 ecc/bls12-378/pairing_test.go                 |   5 +-
 ecc/bls24-315/pairing_test.go                 |   5 +-
 ecc/bls24-317/pairing_test.go                 |   3 +-
 ecc/bw6-633/pairing_test.go                   |   3 +
 ecc/bw6-756/pairing_test.go                   |   4 +-
 ecc/bw6-761/pairing_test.go                   |   3 +
 .../pairing/template/tests/pairing.go.tmpl    | 181 +++++++++++++++++-
 7 files changed, 197 insertions(+), 7 deletions(-)

diff --git a/ecc/bls12-378/pairing_test.go b/ecc/bls12-378/pairing_test.go
index 0bae6e1e41..90bc6ab07f 100644
--- a/ecc/bls12-378/pairing_test.go
+++ b/ecc/bls12-378/pairing_test.go
@@ -286,6 +286,7 @@ func TestMillerLoop(t *testing.T) {
 	properties.TestingRun(t, gopter.ConsoleReporter(false))
 }
 
+
 func TestMultiExpGT(t *testing.T) {
 
 	parameters := gopter.DefaultTestParameters()
@@ -361,7 +362,7 @@ func TestMultiExpGT(t *testing.T) {
 			// compute expected result with double and add
 			var finalScalar, mixerBigInt big.Int
 			finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt))
-			expected.ExpGLV(&_g, &finalScalar)
+			expected.ExpGLV(_g, &finalScalar)
 
 			// mixer ensures that all the words of a fpElement are set
 			var sampleScalars [nbSamples]fr.Element
@@ -416,7 +417,7 @@ func TestMultiExpGT(t *testing.T) {
 			var op1ScalarMul GT
 			finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer)
 			finalBigScalar.ToBigIntRegular(&finalBigScalarBi)
-			op1ScalarMul.ExpGLV(&_g, &finalBigScalarBi)
+			op1ScalarMul.ExpGLV(_g, &finalBigScalarBi)
 
 			return op1ScalarMul.Equal(&op1MultiExp)
 		},
diff --git a/ecc/bls24-315/pairing_test.go b/ecc/bls24-315/pairing_test.go
index 1663205050..d14bb6e4c7 100644
--- a/ecc/bls24-315/pairing_test.go
+++ b/ecc/bls24-315/pairing_test.go
@@ -46,6 +46,7 @@ func TestPairing(t *testing.T) {
 	properties := gopter.NewProperties(parameters)
 
 	genA := GenE24()
+	genR1 := GenFr()
 	genR2 := GenFr()
 	genP := GenFp()
@@ -75,6 +76,7 @@ func TestPairing(t *testing.T) {
 
 			var _e, ne big.Int
 			k := new(big.Int).SetUint64(24)
+			e.Exp(e, k)
 			e.ToBigIntRegular(&_e)
 			ne.Neg(&_e)
@@ -531,7 +533,8 @@ func BenchmarkExpGT(b *testing.B) {
 	var e fp.Element
 	e.SetRandom()
 
-	k := new(big.Int).SetUint64(12)
+	k := new(big.Int).SetUint64(24)
+	e.Exp(e, k)
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
diff --git a/ecc/bls24-317/pairing_test.go b/ecc/bls24-317/pairing_test.go
index 55b48fe256..3889f1e6a2 100644
--- a/ecc/bls24-317/pairing_test.go
+++ b/ecc/bls24-317/pairing_test.go
@@ -46,6 +46,7 @@ func TestPairing(t *testing.T) {
 	properties := gopter.NewProperties(parameters)
 
 	genA := GenE24()
+	genR1 := GenFr()
 	genR2 := GenFr()
 	genP := GenFp()
@@ -531,7 +532,7 @@ func BenchmarkExpGT(b *testing.B) {
 	var e fp.Element
 	e.SetRandom()
 
-	k := new(big.Int).SetUint64(24)
+	k := new(big.Int).SetUint64(12)
 	e.Exp(e, k)
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
diff --git a/ecc/bw6-633/pairing_test.go b/ecc/bw6-633/pairing_test.go
index e46b0086c1..6c39eb3be6 100644
--- a/ecc/bw6-633/pairing_test.go
+++ b/ecc/bw6-633/pairing_test.go
@@ -46,6 +46,7 @@ func TestPairing(t *testing.T) {
 	properties := gopter.NewProperties(parameters)
 
 	genA := GenE6()
+	genR1 := GenFr()
 	genR2 := GenFr()
 	genP := GenFp()
@@ -75,6 +76,7 @@ func TestPairing(t *testing.T) {
 
 			var _e, ne big.Int
 			k := new(big.Int).SetUint64(6)
+			e.Exp(e, k)
 			e.ToBigIntRegular(&_e)
 			ne.Neg(&_e)
@@ -532,6 +534,7 @@ func BenchmarkExpGT(b *testing.B) {
 	e.SetRandom()
 
 	k := new(big.Int).SetUint64(6)
+	e.Exp(e, k)
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
diff --git a/ecc/bw6-756/pairing_test.go b/ecc/bw6-756/pairing_test.go
index bf3cdcc402..ae9dd418b3 100644
--- a/ecc/bw6-756/pairing_test.go
+++ b/ecc/bw6-756/pairing_test.go
@@ -46,6 +46,7 @@ func TestPairing(t *testing.T) {
 	properties := gopter.NewProperties(parameters)
 
 	genA := GenE6()
+	genR1 := GenFr()
 	genR2 := GenFr()
 	genP := GenFp()
@@ -74,7 +75,7 @@ func TestPairing(t *testing.T) {
 
 			var _e, ne big.Int
-			k := new(big.Int).SetUint64(6)
+			k := new(big.Int).SetUint64(12)
 			e.Exp(e, k)
 			e.ToBigIntRegular(&_e)
 			ne.Neg(&_e)
@@ -532,6 +533,7 @@ func BenchmarkExpGT(b *testing.B) {
 	e.SetRandom()
 
 	k := new(big.Int).SetUint64(6)
+	e.Exp(e, k)
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
diff --git a/ecc/bw6-761/pairing_test.go b/ecc/bw6-761/pairing_test.go
index 436e6ed4d5..6c0219ba3f 100644
--- a/ecc/bw6-761/pairing_test.go
+++ b/ecc/bw6-761/pairing_test.go
@@ -46,6 +46,7 @@ func TestPairing(t *testing.T) {
 	properties := gopter.NewProperties(parameters)
 
 	genA := GenE6()
+	genR1 := GenFr()
 	genR2 := GenFr()
 	genP := GenFp()
@@ -75,6 +76,7 @@ func TestPairing(t *testing.T) {
 
 			var _e, ne big.Int
 			k := new(big.Int).SetUint64(6)
+			e.Exp(e, k)
 			e.ToBigIntRegular(&_e)
 			ne.Neg(&_e)
@@ -532,6 +534,7 @@ func BenchmarkExpGT(b *testing.B) {
 	e.SetRandom()
 
 	k := new(big.Int).SetUint64(6)
+	e.Exp(e, k)
 	var _e big.Int
 	e.ToBigIntRegular(&_e)
diff --git a/internal/generator/pairing/template/tests/pairing.go.tmpl b/internal/generator/pairing/template/tests/pairing.go.tmpl
index a8209b7ac2..3a020e2889 100644
--- a/internal/generator/pairing/template/tests/pairing.go.tmpl
+++ b/internal/generator/pairing/template/tests/pairing.go.tmpl
@@ -1,8 +1,11 @@
 import (
 	"fmt"
 	"math/big"
+	"math/bits"
+	"runtime"
 	"testing"
 
+	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr"
 	"github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp"
"github.com/leanovate/gopter" @@ -286,6 +289,146 @@ func TestMillerLoop(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestMultiExpGT(t *testing.T) { + + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 2 + + properties := gopter.NewProperties(parameters) + + genScalar := GenFr() + + // size of the multiExps + const nbSamples = 143 + + // multi exp points + var samplePoints [nbSamples]GT + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + // final scalar to use in double and add method (without mixer factor) + // n(n+1)(2n+1)/6 (sum of the squares from 1 to n) + var scalar big.Int + scalar.SetInt64(nbSamples) + scalar.Mul(&scalar, new(big.Int).SetInt64(nbSamples+1)) + scalar.Mul(&scalar, new(big.Int).SetInt64(2*nbSamples+1)) + scalar.Div(&scalar, new(big.Int).SetInt64(6)) + + // ensure a multiexp that's splitted has the same result as a non-splitted one.. + properties.Property("[GT] Multi exponentation (c=16) should be consistant with splitted multiexp", prop.ForAll( + func(mixer fr.Element) bool { + var samplePointsLarge [nbSamples * 13]GT + for i := 0; i < 13; i++ { + copy(samplePointsLarge[i*nbSamples:], samplePoints[:]) + } + + var r16, splitted1, splitted2 GT + + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples * 13]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } + + scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) + r16.MsmC16(samplePoints[:], scalars16, true) + + splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) + splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) + return r16.Equal(&splitted1) && r16.Equal(&splitted2) + }, + genScalar, + )) + + // we test only c = 5 and c = 16 + properties.Property("[GT] Multi exponentation (c=5, c=16) should be consistant with sum of square", prop.ForAll( + func(mixer fr.Element) bool { + + var expected, g GT + g.SetRandom() + // put into GT + g = FinalExponentiation(&_g) + + // compute expected result with double and add + var finalScalar, mixerBigInt big.Int + finalScalar.Mul(&scalar, mixer.ToBigIntRegular(&mixerBigInt)) + expected.ExpGLV(_g, &finalScalar) + + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + scalars5, _ := partitionScalars(sampleScalars[:], 5, false, runtime.NumCPU()) + scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) + + var r5, r16 GT + r5.MsmC5(samplePoints[:], scalars5, false) + r16.MsmC16(samplePoints[:], scalars16, true) + return (r5.Equal(&expected) && r16.Equal(&expected)) + }, + genScalar, + )) + + // note : this test is here as we expect to have a different multiExp than the above bucket method + // for small number of points + properties.Property("[GT] Multi exponentation (<50points) should be consistant with sum of square", prop.ForAll( + func(mixer fr.Element) bool { + + var _g, g GT + _g.SetRandom() + + // put into GT + _g = FinalExponentiation(&_g) + + g.Set(&_g) + + // mixer ensures that all the words of a fpElement are set + samplePoints := make([]GT, 30) + sampleScalars := make([]fr.Element, 30) + + for i := 1; i <= 30; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + samplePoints[i-1].Set(&g) + g.Mul(&g, &_g) + } + + var op1MultiExp GT + op1MultiExp.MultiExp(samplePoints, sampleScalars, ecc.MultiExpConfig{}) + + var finalBigScalar fr.Element + var finalBigScalarBi big.Int + var op1ScalarMul GT + finalBigScalar.SetString("9455").Mul(&finalBigScalar, &mixer) + finalBigScalar.ToBigIntRegular(&finalBigScalarBi) + op1ScalarMul.ExpGLV(_g, &finalBigScalarBi) + + return op1ScalarMul.Equal(&op1MultiExp) + }, + genScalar, + )) + + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + // ------------------------------------------------------------ // benches @@ -390,9 +533,9 @@ func BenchmarkExpGT(b *testing.B) { var e fp.Element e.SetRandom() - {{if or (eq .Name "bw6-761") (eq .Name "bw6-633")}} + {{if or (eq .Name "bw6-761") (eq .Name "bw6-633") (eq .Name "bw6-756")}} k := new(big.Int).SetUint64(6) - {{else if eq .Name "bls24-315"}} + {{else if or (eq .Name "bls24-315") (eq .Name "bls24-315")}} k := new(big.Int).SetUint64(24) {{ else }} k := new(big.Int).SetUint64(12) @@ -422,3 +565,37 @@ func BenchmarkExpGT(b *testing.B) { } }) } + +func BenchmarkMultiExpGT(b *testing.B) { + // ensure every words of the scalars are filled + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + + const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits + const nbSamples = 1 << pow + + var samplePoints [nbSamples]GT + var sampleScalars [nbSamples]fr.Element + var _g GT + _g.SetRandom() + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
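+// (note on pow in the benchmark below: on a 64-bit arch bits.UintSize = 64, so
+// pow = 32 - 8 = 24 and the runs sweep power-of-two sizes from 2^5 up to 2^24 points)
+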
+func BenchmarkMultiExpGT(b *testing.B) {
+	// ensure every word of the scalars is filled
+	var mixer fr.Element
+	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
+
+	const pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64-bit arch, 12 on 32-bit
+	const nbSamples = 1 << pow
+
+	var samplePoints [nbSamples]GT
+	var sampleScalars [nbSamples]fr.Element
+	var _g GT
+	_g.SetRandom()
+
+	for i := 1; i <= nbSamples; i++ {
+		sampleScalars[i-1].SetUint64(uint64(i)).
+			Mul(&sampleScalars[i-1], &mixer).
+			FromMont()
+		samplePoints[i-1].Set(&_g)
+	}
+
+	var testPoint GT
+
+	for i := 5; i <= pow; i++ {
+		using := 1 << i
+
+		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
+	}
+}

From d890c8d62588cd374e3f77f8cc01ca8d43d92b51 Mon Sep 17 00:00:00 2001
From: Youssef El Housni
Date: Tue, 21 Jun 2022 10:00:52 +0100
Subject: [PATCH 16/16] fix(bw6-633/GT): GT-MSM with window size 5

---
 ecc/bw6-633/internal/fptower/multiexp.go | 562 +++++++++++++++++++++++
 ecc/bw6-756/internal/fptower/multiexp.go |   2 +-
 ecc/bw6-761/internal/fptower/multiexp.go |   2 +-
 3 files changed, 564 insertions(+), 2 deletions(-)
 create mode 100644 ecc/bw6-633/internal/fptower/multiexp.go

diff --git a/ecc/bw6-633/internal/fptower/multiexp.go b/ecc/bw6-633/internal/fptower/multiexp.go
new file mode 100644
index 0000000000..694a74dc47
--- /dev/null
+++ b/ecc/bw6-633/internal/fptower/multiexp.go
@@ -0,0 +1,562 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package fptower
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bw6-633/fr"
+	"github.com/consensys/gnark-crypto/internal/parallel"
+	"math"
+	"runtime"
+)
+
+/* Multi-Exponentiation à la Pippenger */
+
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
+
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
+
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
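+// (example with c = 8, where the threshold is 2^{7} = 128: a window value of 200 is
+// re-encoded as the digit 200 - 256 = -56 plus a carry of 1 into the next window,
+// since 200 = 256 - 56)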
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMul)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
+
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.IsUint64() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and a possible carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
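+				// (for reference: a negative digit -d is stored a few lines below as
+				// (d-1) | msbWindow, so -1 encodes as msbWindow and -2^{c-1} encodes
+				// as 2^{c}-1, still fitting in a c-bit window)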
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
+
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+func (p *E6) MultiExp(points []E6, scalars []fr.Element, config ecc.MultiExpConfig) (*E6, error) {
+	// note:
+	// each of the MsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each MsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented MsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
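+		// (worked instance of the cost model, taking fr.Limbs = 5, i.e. 320 scalar bits as
+		// on bw6-633: for nbPoints = 2^16, c = 8 costs 320/8 * (2^16 + 2^8) ≈ 2.63M group
+		// operations while c = 16 costs 320/16 * (2^16 + 2^16) ≈ 2.62M, so the model alone
+		// barely separates them and empirical tuning matters)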
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerE6, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
+	_p := make([]E6, nbSplits-1)
+	chDone := make(chan int, nbSplits-1)
+	for i := 0; i < nbSplits-1; i++ {
+		start := i * nbPoints
+		end := start + nbPoints
+		go func(start, end, i int) {
+			msmInnerE6(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
+			chDone <- i
+		}(start, end, i)
+	}
+
+	msmInnerE6(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk)
+	for i := 0; i < nbSplits-1; i++ {
+		done := <-chDone
+		p.Mul(p, &_p[done])
+	}
+	close(chDone)
+	return p, nil
+}
+
+func msmInnerE6(p *E6, c int, points []E6, scalars []fr.Element, splitFirstChunk bool) {
+
+	switch c {
+
+	case 4:
+		p.MsmC4(points, scalars, splitFirstChunk)
+
+	case 5:
+		p.MsmC5(points, scalars, splitFirstChunk)
+
+	case 8:
+		p.MsmC8(points, scalars, splitFirstChunk)
+
+	case 16:
+		p.MsmC16(points, scalars, splitFirstChunk)
+
+	default:
+		panic("not implemented")
+	}
+}
+
+// msmReduceChunkE6 reduces the weighted sum of the buckets into the result of the multiExp
+func msmReduceChunkE6(p *E6, c int, chChunks []chan E6) *E6 {
+	var _p E6
+	totalj := <-chChunks[len(chChunks)-1]
+	_p.Set(&totalj)
+	for j := len(chChunks) - 2; j >= 0; j-- {
+		for l := 0; l < c; l++ {
+			_p.CyclotomicSquare(&_p)
+		}
+		totalj := <-chChunks[j]
+		_p.Mul(&_p, &totalj)
+	}
+
+	p.Set(&_p)
+	return p
+}
+
+func msmProcessChunkE6(chunk uint64,
+	chRes chan<- E6,
+	buckets []E6,
+	c uint64,
+	points []E6,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].SetOne()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
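+	// (illustration: with c = 5 and chunk = 13, jc = 65, so s.index = 1 and s.shift = 1;
+	// the digit is then read from scalar bits 65..69, i.e. bits 1..5 of the second word)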
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		var tmp E6
+		// if the msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].Mul(&buckets[bits-1], &points[i])
+		} else {
+			// sub
+			tmp.Conjugate(&points[i])
+			buckets[bits & ^msbWindow].Mul(&buckets[bits & ^msbWindow], &tmp)
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var one, runningSum, total E6
+	runningSum.SetOne()
+	total.SetOne()
+	one.SetOne()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].Equal(&one) {
+			runningSum.Mul(&runningSum, &buckets[k])
+		}
+		total.Mul(&total, &runningSum)
+	}
+
+	chRes <- total
+
+}
+
+func (p *E6) MsmC4(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 4                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
+
+func (p *E6) MsmC5(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 5                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
+
+func (p *E6) MsmC8(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 8                   // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
+
+func (p *E6) MsmC16(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6 {
+	const (
+		c        = 16                  // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks]chan E6
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan E6, 1)
+	}
+
+	processChunk := func(j int, points []E6, scalars []fr.Element, chChunk chan E6) {
+		var buckets [1 << (c - 1)]E6
+		msmProcessChunkE6(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan E6, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.Mul(&s1, &s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkE6(p, c, chChunks[:])
+}
diff --git a/ecc/bw6-756/internal/fptower/multiexp.go b/ecc/bw6-756/internal/fptower/multiexp.go
index 9179829638..cc82237c8a 100644
--- a/ecc/bw6-756/internal/fptower/multiexp.go
+++ b/ecc/bw6-756/internal/fptower/multiexp.go
@@ -443,7 +443,7 @@ func (p *E6) MsmC5(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6
 		chChunks[i] = make(chan E6, 1)
 	}
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
+	// c doesn't divide 384, last window is smaller we can allocate less buckets
 	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
 	go func(j uint64, points []E6, scalars []fr.Element) {
 		var buckets [1 << (lastC - 1)]E6
diff --git a/ecc/bw6-761/internal/fptower/multiexp.go b/ecc/bw6-761/internal/fptower/multiexp.go
index 97216f80f9..a6996edbf8 100644
--- a/ecc/bw6-761/internal/fptower/multiexp.go
+++ b/ecc/bw6-761/internal/fptower/multiexp.go
@@ -443,7 +443,7 @@ func (p *E6) MsmC5(points []E6, scalars []fr.Element, splitFirstChunk bool) *E6
 		chChunks[i] = make(chan E6, 1)
 	}
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
+	// c doesn't divide 384, last window is smaller we can allocate less buckets
 	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
 	go func(j uint64, points []E6, scalars []fr.Element) {
 		var buckets [1 << (lastC - 1)]E6