ingonyama-zk · yshekel · Nov 5, 2024 · Sep 10, 2024 · Sep 13, 2024 · Sep 16, 2024
diff --git a/docs/docs/icicle/golang-bindings/vec-ops.md b/docs/docs/icicle/golang-bindings/vec-ops.md
@@ -4,8 +4,8 @@
 
 Icicle exposes a number of vector operations which a user can use:
 
-* The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication.
-* MatrixTranspose API allows a user to perform a transpose on a vector representation of a matrix
+* The VecOps API provides efficient vector operations such as addition, subtraction, and multiplication, supporting both single and batched operations.
+* MatrixTranspose API allows a user to perform a transpose on a vector representation of a matrix, with support for batched transpositions.
 
 ## VecOps API Documentation
 
@@ -121,6 +121,8 @@ type VecOpsConfig struct {
 	isBOnDevice      bool
 	isResultOnDevice bool
 	IsAsync          bool
+	batch_size       int
+	columns_batch    bool
 	Ext              config_extension.ConfigExtensionHandler
 }
 ```
@@ -132,6 +134,8 @@ type VecOpsConfig struct {
 - **`isBOnDevice`**: Indicates if vector `b` is located on the device.
 - **`isResultOnDevice`**: Specifies where the result vector should be stored (device or host memory).
 - **`IsAsync`**: Controls whether the vector operation runs asynchronously.
+- **`batch_size`**: Number of vectors (or operations) to process in a batch. Each vector operation will be performed independently on each batch element.
+- **`columns_batch`**: `true` if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). If `false`, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array).
 - **`Ext`**: Extended configuration for backend.
 
 #### Default Configuration
@@ -148,6 +152,8 @@ This section describes the functionality of the `TransposeMatrix` function used
 
 The function takes a matrix represented as a 1D slice and transposes it, storing the result in another 1D slice.
 
+If `VecOpsConfig` specifies a `batch_size` greater than one, the transposition is performed on multiple matrices simultaneously, producing corresponding transposed matrices. The storage arrangement of batched matrices is determined by the `columns_batch` field in the `VecOpsConfig`.
+
 ### Function
 
 ```go

diff --git a/docs/docs/icicle/primitives/vec_ops.md b/docs/docs/icicle/primitives/vec_ops.md
@@ -16,6 +16,8 @@ The `VecOpsConfig` struct is a configuration object used to specify parameters f
 - **`is_b_on_device: bool`**: Indicates whether the second input vector (`b`) is already on the device. If `false`, the vector will be copied from the host to the device. This field is optional.
 - **`is_result_on_device: bool`**: Indicates whether the result should be stored on the device. If `false`, the result will be transferred back to the host.
 - **`is_async: bool`**: Specifies whether the vector operation should be performed asynchronously. When `true`, the operation will not block the CPU, allowing other operations to proceed concurrently. Asynchronous execution requires careful synchronization to ensure data integrity.
+- **`batch_size: int`**: Number of vectors (or operations) to process in a batch. Each vector operation will be performed independently on each batch element.
+- **`columns_batch: bool`**: True if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). If false, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array).
 - **`ext: ConfigExtension*`**: Backend-specific extensions.
 
 #### Default Configuration
@@ -28,14 +30,17 @@ static VecOpsConfig default_vec_ops_config() {
       false,   // is_b_on_device
       false,   // is_result_on_device
       false,   // is_async
+      1,       // batch_size
+      false,   // columns_batch
+      nullptr  // ext
     };
     return config;
 }
 ```
 
 ### Element-wise Operations
 
-These functions perform element-wise operations on two input vectors `a` and `b`, producing an output vector.
+These functions perform element-wise operations on two input vectors `a` and `b`, producing an output vector. If `VecOpsConfig` specifies a `batch_size` greater than one, the operations are performed on multiple pairs of vectors simultaneously, producing corresponding output vectors.
 
 #### `vector_add`
 
@@ -90,9 +95,31 @@ template <typename T>
 eIcicleError convert_montgomery(const T* input, uint64_t size, bool is_into, const VecOpsConfig& config, T* output);
 ```
 
+### Reduction Operations
+
+These functions perform reduction operations on vectors. If `VecOpsConfig` specifies a `batch_size` greater than one, the operations are performed on multiple vectors simultaneously, producing corresponding output values. The storage arrangement of batched vectors is determined by the `columns_batch` field in the `VecOpsConfig`.
+
+#### `vector_sum`
+
+Computes the sum of all elements in each vector in a batch.
+
+```cpp
+template <typename T>
+eIcicleError vector_sum(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output);
+```
+
+#### `vector_product`
+
+Computes the product of all elements in each vector in a batch.
+
+```cpp
+template <typename T>
+eIcicleError vector_product(const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output);
+```
+
 ### Scalar-Vector Operations
 
-These functions apply a scalar operation to each element of a vector.
+These functions apply a scalar operation to each element of a vector. If `VecOpsConfig` specifies a `batch_size` greater than one, the operations are performed on multiple vector-scalar pairs simultaneously, producing corresponding output vectors.
 
 #### `scalar_add_vec / scalar_sub_vec`
 
@@ -123,7 +150,7 @@ eIcicleError scalar_mul_vec(const T* scalar_a, const T* vec_b, uint64_t size, co
 
 ### Matrix Operations
 
-These functions perform operations on matrices.
+These functions perform operations on matrices. If `VecOpsConfig` specifies a `batch_size` greater than one, the operations are performed on multiple matrices simultaneously, producing corresponding output matrices.
 
 #### `matrix_transpose`
 
@@ -138,7 +165,7 @@ eIcicleError matrix_transpose(const T* mat_in, uint32_t nof_rows, uint32_t nof_c
 
 #### `bit_reverse`
 
-Reorders the vector elements based on a bit-reversal pattern.
+Reorders the vector elements based on a bit-reversal pattern. If `VecOpsConfig` specifies a `batch_size` greater than one, the operation is performed on multiple vectors simultaneously.
 
 ```cpp
 template <typename T>
@@ -147,16 +174,16 @@ eIcicleError bit_reverse(const T* vec_in, uint64_t size, const VecOpsConfig& con
 
 #### `slice`
 
-Extracts a slice from a vector.
+Extracts a slice from a vector. If `VecOpsConfig` specifies a `batch_size` greater than one, the operation is performed on multiple vectors simultaneously, producing corresponding output vectors.
 
 ```cpp
 template <typename T>
-eIcicleError slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size, const VecOpsConfig& config, T* vec_out);
+eIcicleError slice(const T* vec_in, uint64_t offset, uint64_t stride, uint64_t size_in, uint64_t size_out, const VecOpsConfig& config, T* vec_out);
 ```
 
 #### `highest_non_zero_idx`
 
-Finds the highest non-zero index in a vector.
+Finds the highest non-zero index in a vector. If `VecOpsConfig` specifies a `batch_size` greater than one, the operation is performed on multiple vectors simultaneously.
 
 ```cpp
 template <typename T>
@@ -165,7 +192,7 @@ eIcicleError highest_non_zero_idx(const T* vec_in, uint64_t size, const VecOpsCo
 
 #### `polynomial_eval`
 
-Evaluates a polynomial at given domain points.
+Evaluates a polynomial at given domain points. If `VecOpsConfig` specifies a `batch_size` greater than one, the operation is performed on multiple polynomials simultaneously.
 
 ```cpp
 template <typename T>
@@ -174,7 +201,7 @@ eIcicleError polynomial_eval(const T* coeffs, uint64_t coeffs_size, const T* dom
 
 #### `polynomial_division`
 
-Divides two polynomials.
+Divides two polynomials. If `VecOpsConfig` specifies a `batch_size` greater than one, the operation is performed on multiple pairs of polynomials simultaneously.
 
 ```cpp
 template <typename T>

diff --git a/docs/docs/icicle/programmers_guide/general.md b/docs/docs/icicle/programmers_guide/general.md
@@ -21,6 +21,7 @@ The configuration struct allows users to modify settings such as:
 
 - Specifying whether inputs and outputs are on the host or device.
 - Adjusting the data layout for specific optimizations.
+- Setting batching parameters (`batch_size` and `columns_batch`) to perform operations on multiple data sets simultaneously.
 - Passing custom options to the backend implementation through an extension mechanism, such as setting the number of CPU cores to use.
 
 ### Example (C++)
@@ -31,6 +32,8 @@ The configuration struct allows users to modify settings such as:
 // Create config struct for vector add
 VecOpsConfig config = default_vec_ops_config();
 // optionally modify the config struct here
+config.batch_size = 4;          // Process 4 vector operations in a batch
+config.columns_batch = true;    // Batched vectors are stored as columns
 
 // Call the API
 eIcicleError err = vector_add(vec_a, vec_b, size, config, vec_res);
@@ -45,6 +48,8 @@ struct VecOpsConfig {
     bool is_b_on_device;       /**< True if `b` is on the device, false if it is not. Default value: false. OPTIONAL. */
     bool is_result_on_device;  /**< If true, the output is preserved on the device, otherwise on the host. Default value: false. */
     bool is_async;             /**< Whether to run the vector operations asynchronously. */
+    int batch_size;            /**< Number of vector operations to process in a batch. Default value: 1. */
+    bool columns_batch;        /**< True if batched vectors are stored as columns; false if stored contiguously. Default value: false. */
     ConfigExtension* ext = nullptr; /**< Backend-specific extension. */
 };
 ```

diff --git a/docs/docs/icicle/rust-bindings/vec-ops.md b/docs/docs/icicle/rust-bindings/vec-ops.md
@@ -1,10 +1,10 @@
 # Vector Operations API
 
-Our vector operations API includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory.
+Our vector operations API includes fundamental methods for addition, subtraction, and multiplication of vectors, with support for both host and device memory, as well as batched operations.
 
 ## Vector Operations Configuration
 
-The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context and operation modes.
+The `VecOpsConfig` struct encapsulates the settings for vector operations, including device context, operation modes, and batching parameters.
 
 ### `VecOpsConfig`
 
@@ -17,6 +17,8 @@ pub struct VecOpsConfig {
     pub is_b_on_device: bool,
     pub is_result_on_device: bool,
     pub is_async: bool,
+    pub batch_size: usize,
+    pub columns_batch: bool,
     pub ext: ConfigExtension,
 }
 ```
@@ -28,6 +30,9 @@ pub struct VecOpsConfig {
 - **`is_b_on_device: bool`**: Indicates whether the input b data has been preloaded on the device memory. If `false` inputs will be copied from host to device.
 - **`is_result_on_device: bool`**: Indicates whether the output data is preloaded in device memory. If `false` outputs will be copied from host to device.
 - **`is_async: bool`**: Specifies whether the NTT operation should be performed asynchronously.
+- **`batch_size: usize`**: Number of vector operations to process in a single batch. Each operation will be performed independently on each batch element.
+- **`columns_batch: bool`**: `true` if the batched vectors are stored as columns in a 2D array (i.e., the vectors are strided in memory as columns of a matrix). If `false`, the batched vectors are stored contiguously in memory (e.g., as rows or in a flat array).
+
 - **`ext: ConfigExtension`**: extended configuration for backend.
 
 ### Default Configuration
@@ -40,11 +45,11 @@ let cfg = VecOpsConfig::default();
 
 ## Vector Operations
 
-Vector operations are implemented through the `VecOps` trait, providing methods for addition, subtraction, and multiplication of vectors.
+Vector operations are implemented through the `VecOps` trait, providing methods for addition, subtraction, and multiplication of vectors. These methods support both single and batched operations, depending on the `batch_size` and `columns_batch` settings in `VecOpsConfig`.
 
 ### Methods
 
-All operations are element-wise operations, and the results placed into the `result` param. These operations are not in place.
+All operations are element-wise, and the results are placed into the `result` param. These operations are not in place, except for `accumulate`.
 
 - **`add`**: Computes the element-wise sum of two vectors.
 - **`accumulate`**: Sum input b to a inplace.

diff --git a/examples/c++/polynomial-multiplication/example.cpp b/examples/c++/polynomial-multiplication/example.cpp
@@ -69,21 +69,18 @@ int main(int argc, char** argv)
     ICICLE_CHECK(bn254_ntt(polyB.get(), NTT_SIZE, NTTDir::kForward, &ntt_config, d_polyB));
 
     // (4) multiply A,B
-    VecOpsConfig config{
-      nullptr,
-      true,   // is_a_on_device
-      true,   // is_b_on_device
-      true,   // is_result_on_device
-      false,  // is_async
-      nullptr // ext
-    };
-    ICICLE_CHECK(bn254_vector_mul(d_polyA, d_polyB, NTT_SIZE, &config, d_polyRes));
+    VecOpsConfig config = default_vec_ops_config();
+    config.is_a_on_device = true;
+    config.is_b_on_device = true;
+    config.is_result_on_device = true;
+
+    ICICLE_CHECK(vector_mul(d_polyA, d_polyB, NTT_SIZE, config, d_polyRes));
 
     // (5) INTT (in place)
     ntt_config.are_inputs_on_device = true;
     ntt_config.are_outputs_on_device = true;
     ntt_config.ordering = Ordering::kMN;
-    ICICLE_CHECK(bn254_ntt(d_polyRes, NTT_SIZE, NTTDir::kInverse, &ntt_config, d_polyRes));
+    ICICLE_CHECK(ntt(d_polyRes, NTT_SIZE, NTTDir::kInverse, ntt_config, d_polyRes));
 
     if (print) { END_TIMER(poly_multiply, "polynomial multiplication took"); }