From 75ed67f41a4d297351976e419c5fe5d3cc18fe38 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 6 Oct 2023 14:39:38 +0200
Subject: [PATCH 01/18] Add batch::matrix::Ell class and core

Co-authored-by: Aditya Kashi <kashia@ornl.gov>
---
 core/matrix/batch_ell.cpp                | 235 ++++++++++++++
 include/ginkgo/core/matrix/batch_ell.hpp | 390 +++++++++++++++++++++++
 2 files changed, 625 insertions(+)
 create mode 100644 core/matrix/batch_ell.cpp
 create mode 100644 include/ginkgo/core/matrix/batch_ell.hpp
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
new file mode 100644
index 00000000000..63d4f0dda8a
--- /dev/null
+++ b/core/matrix/batch_ell.cpp
@@ -0,0 +1,235 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include <algorithm>
+#include <type_traits>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+namespace gko {
+namespace batch {
+namespace matrix {
+namespace ell {
+namespace {
+
+
+GKO_REGISTER_OPERATION(simple_apply, batch_ell::simple_apply);
+GKO_REGISTER_OPERATION(advanced_apply, batch_ell::advanced_apply);
+
+
+}  // namespace
+}  // namespace ell
+
+
+namespace detail {
+
+
+template <typename ValueType, typename IndexType>
+batch_dim<2> compute_batch_size(
+    const std::vector<gko::matrix::Ell<ValueType, IndexType>*>& matrices)
+{
+    auto common_size = matrices[0]->get_size();
+    for (size_type i = 1; i < matrices.size(); ++i) {
+        GKO_ASSERT_EQUAL_DIMENSIONS(common_size, matrices[i]->get_size());
+    }
+    return batch_dim<2>{matrices.size(), common_size};
+}
+
+
+}  // namespace detail
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<gko::matrix::Ell<ValueType, IndexType>>
+Ell<ValueType, IndexType>::create_view_for_item(size_type item_id)
+{
+    auto exec = this->get_executor();
+    // An item stores num_rows * nnz_per_row entries, so stride == num_rows.
+    auto stride = this->get_common_size()[0];
+    auto mat = unbatch_type::create(
+        exec, this->get_common_size(),
+        make_array_view(exec, this->get_num_elements_per_item(),
+                        this->get_values_for_item(item_id)),
+        make_array_view(exec, this->get_num_elements_per_item(),
+                        this->get_col_idxs_for_item(item_id)),
+        this->get_num_stored_elements_per_row(), stride);
+    return mat;
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<const gko::matrix::Ell<ValueType, IndexType>>
+Ell<ValueType, IndexType>::create_const_view_for_item(size_type item_id) const
+{
+    auto exec = this->get_executor();
+    // An item stores num_rows * nnz_per_row entries, so stride == num_rows.
+    auto stride = this->get_common_size()[0];
+    auto mat = unbatch_type::create_const(
+        exec, this->get_common_size(),
+        make_const_array_view(exec, this->get_num_elements_per_item(),
+                              this->get_const_values_for_item(item_id)),
+        make_const_array_view(exec, this->get_num_elements_per_item(),
+                              this->get_const_col_idxs_for_item(item_id)),
+        this->get_num_stored_elements_per_row(), stride);
+    return mat;
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<Ell<ValueType, IndexType>>
+Ell<ValueType, IndexType>::create_with_config_of(
+    ptr_param<const Ell<ValueType, IndexType>> other)
+{
+    // De-referencing `other` before calling the functions (instead of
+    // using operator `->`) is currently required to be compatible with
+    // CUDA 10.1.
+    // Otherwise, it results in a compile error.
+    return (*other).create_with_same_config();
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<Ell<ValueType, IndexType>>
+Ell<ValueType, IndexType>::create_with_same_config() const
+{
+    return Ell<ValueType, IndexType>::create(
+        this->get_executor(), this->get_size(),
+        this->get_num_stored_elements_per_row());
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<const Ell<ValueType, IndexType>>
+Ell<ValueType, IndexType>::create_const(
+    std::shared_ptr<const Executor> exec, const batch_dim<2>& sizes,
+    int num_elems_per_row, gko::detail::const_array_view<ValueType>&& values,
+    gko::detail::const_array_view<IndexType>&& col_idxs)
+{
+    // cast const-ness away, but return a const object afterwards,
+    // so we can ensure that no modifications take place.
+    return std::unique_ptr<const Ell>(
+        new Ell{exec, sizes, num_elems_per_row,
+                gko::detail::array_const_cast(std::move(values)),
+                gko::detail::array_const_cast(std::move(col_idxs))});
+}
+
+
+inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes)
+{
+    return batch_dim<2>(sizes.get_num_batch_items(),
+                        dim<2>(1, sizes.get_common_size()[1]));
+}
+
+
+template <typename ValueType, typename IndexType>
+Ell<ValueType, IndexType>::Ell(std::shared_ptr<const Executor> exec,
+                               const batch_dim<2>& size, int num_elems_per_row)
+    : EnableBatchLinOp<Ell<ValueType, IndexType>>(exec, size),
+      num_elems_per_row_(num_elems_per_row),
+      values_(exec, compute_num_elems(size, num_elems_per_row)),
+      col_idxs_(exec, compute_num_elems(size, num_elems_per_row))
+{}
+
+
+template <typename ValueType, typename IndexType>
+void Ell<ValueType, IndexType>::apply_impl(const MultiVector<ValueType>* b,
+                                           MultiVector<ValueType>* x) const
+{
+    GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items());
+    GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items());
+
+    GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size());
+    GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size());
+    GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size());
+    this->get_executor()->run(ell::make_simple_apply(this, b, x));
+}
+
+
+template <typename ValueType, typename IndexType>
+void Ell<ValueType, IndexType>::apply_impl(const MultiVector<ValueType>* alpha,
+                                           const MultiVector<ValueType>* b,
+                                           const MultiVector<ValueType>* beta,
+                                           MultiVector<ValueType>* x) const
+{
+    GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items());
+    GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items());
+
+    GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size());
+    GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size());
+    GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size());
+    GKO_ASSERT_EQUAL_DIMENSIONS(alpha->get_common_size(), gko::dim<2>(1, 1));
+    GKO_ASSERT_EQUAL_DIMENSIONS(beta->get_common_size(), gko::dim<2>(1, 1));
+    this->get_executor()->run(
+        ell::make_advanced_apply(alpha, this, b, beta, x));
+}
+
+
+template <typename ValueType, typename IndexType>
+void Ell<ValueType, IndexType>::convert_to(
+    Ell<next_precision<ValueType>, IndexType>* result) const
+{
+    result->values_ = this->values_;
+    result->col_idxs_ = this->col_idxs_;
+    result->num_elems_per_row_ = this->num_elems_per_row_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename IndexType>
+void Ell<ValueType, IndexType>::move_to(
+    Ell<next_precision<ValueType>, IndexType>* result)
+{
+    this->convert_to(result);
+}
+
+
+#define GKO_DECLARE_BATCH_ELL_MATRIX(_vtype, _itype) class Ell<_vtype, _itype>
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_BATCH_ELL_MATRIX);
+
+
+}  // namespace matrix
+}  // namespace batch
+}  // namespace gko
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
new file mode 100644
index 00000000000..374f1479664
--- /dev/null
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -0,0 +1,390 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_MATRIX_BATCH_ELL_HPP_
+#define GKO_PUBLIC_CORE_MATRIX_BATCH_ELL_HPP_
+
+
+#include <initializer_list>
+#include <vector>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_lin_op.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/mtx_io.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+
+
+namespace gko {
+namespace batch {
+namespace matrix {
+
+
+/**
+ * Ell is a sparse batch matrix format that stores, for every batch item, a
+ * fixed number of entries per row together with their column indices.
+ *
+ * All batch items share the same number of stored elements per row; the
+ * values and the column indices of each batch item are stored consecutively
+ * in memory, one batch item after the other.
+ *
+ * @note Each batch item can be viewed as a gko::matrix::Ell matrix through
+ * create_view_for_item() and create_const_view_for_item(), which do not copy
+ * the underlying data.
+ *
+ * @tparam ValueType  precision of matrix elements
+ * @tparam IndexType  precision of the column indices
+ *
+ * @ingroup batch_ell
+ * @ingroup mat_formats
+ * @ingroup BatchLinOp
+ */
+template <typename ValueType = default_precision, typename IndexType = int32>
+class Ell final
+    : public EnableBatchLinOp<Ell<ValueType, IndexType>>,
+      public EnableCreateMethod<Ell<ValueType, IndexType>>,
+      public ConvertibleTo<Ell<next_precision<ValueType>, IndexType>> {
+    friend class EnableCreateMethod<Ell>;
+    friend class EnablePolymorphicObject<Ell, BatchLinOp>;
+    friend class Ell<to_complex<ValueType>, IndexType>;
+    friend class Ell<next_precision<ValueType>, IndexType>;
+
+public:
+    using EnableBatchLinOp<Ell>::convert_to;
+    using EnableBatchLinOp<Ell>::move_to;
+
+    using value_type = ValueType;
+    using index_type = IndexType;
+    using transposed_type = Ell<ValueType, IndexType>;
+    using unbatch_type = gko::matrix::Ell<ValueType, IndexType>;
+    using absolute_type = remove_complex<Ell>;
+    using complex_type = to_complex<Ell>;
+
+    /**
+     * Creates an Ell matrix with the configuration of another Ell
+     * matrix.
+     *
+     * @param other  The other matrix whose configuration needs to be copied.
+     */
+    static std::unique_ptr<Ell> create_with_config_of(
+        ptr_param<const Ell> other);
+
+    void convert_to(
+        Ell<next_precision<ValueType>, IndexType>* result) const override;
+
+    void move_to(Ell<next_precision<ValueType>, IndexType>* result) override;
+
+    /**
+     * Creates a mutable view (of gko::matrix::Ell type) of one item of the
+     * batch::matrix::Ell<value_type, index_type> object. Does not perform any
+     * deep copies, but only returns a view of the data.
+     *
+     * @param item_id  The index of the batch item
+     *
+     * @return  a gko::matrix::Ell object viewing the data of the batch item
+     * at the given index.
+     */
+    std::unique_ptr<unbatch_type> create_view_for_item(size_type item_id);
+
+    /**
+     * @copydoc create_view_for_item(size_type)
+     */
+    std::unique_ptr<const unbatch_type> create_const_view_for_item(
+        size_type item_id) const;
+
+    /**
+     * Returns a pointer to the array of values of the matrix
+     *
+     * @return the pointer to the array of values
+     */
+    value_type* get_values() noexcept { return values_.get_data(); }
+
+    /**
+     * @copydoc get_values()
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const value_type* get_const_values() const noexcept
+    {
+        return values_.get_const_data();
+    }
+
+    /**
+     * Returns a pointer to the array of column indices of the matrix
+     *
+     * @return the pointer to the array of column indices
+     */
+    index_type* get_col_idxs() noexcept { return col_idxs_.get_data(); }
+
+    /**
+     * @copydoc get_col_idxs()
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const index_type* get_const_col_idxs() const noexcept
+    {
+        return col_idxs_.get_const_data();
+    }
+
+    /**
+     * Returns the number of elements per row explicitly stored.
+     *
+     * @return the number of elements stored in each row of the ELL matrix. Same
+     * for each batch item
+     */
+    int get_num_stored_elements_per_row() const noexcept
+    {
+        return num_elems_per_row_;
+    }
+
+    /**
+     * Returns the number of elements explicitly stored in the batch matrix,
+     * cumulative across all the batch items.
+     *
+     * @return the number of elements explicitly stored in the batch matrix,
+     *         cumulative across all the batch items
+     */
+    size_type get_num_stored_elements() const noexcept
+    {
+        return values_.get_num_elems();
+    }
+
+    /**
+     * Returns the number of stored elements in each batch item.
+     *
+     * @return the number of stored elements per batch item.
+     */
+    size_type get_num_elements_per_item() const noexcept
+    {
+        return this->get_num_stored_elements() / this->get_num_batch_items();
+    }
+
+    /**
+     * Returns a pointer to the array of column indices of the matrix for a
+     * specific batch item.
+     *
+     * @param batch_id  the id of the batch item.
+     *
+     * @return the pointer to the array of column indices of that item
+     */
+    index_type* get_col_idxs_for_item(size_type batch_id) noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return col_idxs_.get_data() +
+               batch_id * this->get_num_elements_per_item();
+    }
+
+    /**
+     * @copydoc get_col_idxs_for_item(size_type)
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const index_type* get_const_col_idxs_for_item(
+        size_type batch_id) const noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return col_idxs_.get_const_data() +
+               batch_id * this->get_num_elements_per_item();
+    }
+
+    /**
+     * Returns a pointer to the array of values of the matrix for a
+     * specific batch item.
+     *
+     * @param batch_id  the id of the batch item.
+     *
+     * @return the pointer to the array of values
+     */
+    value_type* get_values_for_item(size_type batch_id) noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_data() +
+               batch_id * this->get_num_elements_per_item();
+    }
+
+    /**
+     * @copydoc get_values_for_item(size_type)
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const value_type* get_const_values_for_item(
+        size_type batch_id) const noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_const_data() +
+               batch_id * this->get_num_elements_per_item();
+    }
+
+    /**
+     * Creates a constant (immutable) batch ell matrix from constant
+     * arrays.
+     *
+     * @param exec  the executor to create the matrix on
+     * @param sizes  the dimensions of the batch matrix
+     * @param num_elems_per_row  the number of elements to be stored in each row
+     * @param values  the value array of the matrix
+     * @param col_idxs the col_idxs array of the matrix
+     *
+     * @return A smart pointer to the constant matrix wrapping the input
+     * arrays (if they reside on the same executor as the matrix) or a copy of
+     * the arrays on the correct executor.
+     */
+    static std::unique_ptr<const Ell<value_type, index_type>> create_const(
+        std::shared_ptr<const Executor> exec, const batch_dim<2>& sizes,
+        const int num_elems_per_row,
+        gko::detail::const_array_view<ValueType>&& values,
+        gko::detail::const_array_view<IndexType>&& col_idxs);
+
+    /**
+     * Apply the matrix to a multi-vector. Represents the matrix vector
+     * multiplication, x = A * b, where x and b are both multi-vectors.
+     *
+     * @param b  the multi-vector to be applied to
+     * @param x  the output multi-vector
+     */
+    void apply(const MultiVector<value_type>* b,
+               MultiVector<value_type>* x) const
+    {
+        this->apply_impl(b, x);
+    }
+
+    /**
+     * Apply the matrix to a multi-vector with a linear combination of the given
+     * input vector. Represents the matrix vector multiplication, x = alpha* A *
+     * b + beta * x, where x and b are both multi-vectors.
+     *
+     * @param alpha  the scalar to scale the matrix-vector product with
+     * @param b      the multi-vector to be applied to
+     * @param beta   the scalar to scale the x vector with
+     * @param x      the output multi-vector
+     */
+    void apply(const MultiVector<value_type>* alpha,
+               const MultiVector<value_type>* b,
+               const MultiVector<value_type>* beta,
+               MultiVector<value_type>* x) const
+    {
+        this->apply_impl(alpha, b, beta, x);
+    }
+
+private:
+    size_type compute_num_elems(const batch_dim<2>& size, int num_elems_per_row)
+    {
+        return size.get_num_batch_items() * size.get_common_size()[0] *
+               num_elems_per_row;
+    }
+
+protected:
+    /**
+     * Creates an uninitialized Ell matrix of the specified size.
+     *
+     * @param exec  Executor associated to the matrix
+     * @param size  size of the matrix
+     * @param num_elems_per_row  the number of elements to be stored in each row
+     */
+    Ell(std::shared_ptr<const Executor> exec,
+        const batch_dim<2>& size = batch_dim<2>{},
+        const int num_elems_per_row = 0);
+
+    /**
+     * Creates a Ell matrix from an already allocated (and initialized)
+     * array.
+     *
+     * @tparam ValuesArray  type of array of values
+     *
+     * @param exec  Executor associated to the matrix
+     * @param size  size of the matrix
+     * @param num_elems_per_row  the number of elements to be stored in each row
+     * @param values  array of matrix values
+     * @param col_idxs the col_idxs array of the matrix
+     *
+     * @note If `values` is not an rvalue, not an array of ValueType, or is on
+     *       the wrong executor, an internal copy will be created, and the
+     *       original array data will not be used in the matrix.
+     */
+    template <typename ValuesArray, typename IndicesArray>
+    Ell(std::shared_ptr<const Executor> exec, const batch_dim<2>& size,
+        const int num_elems_per_row, ValuesArray&& values,
+        IndicesArray&& col_idxs)
+        : EnableBatchLinOp<Ell>(exec, size),
+          num_elems_per_row_{num_elems_per_row},
+          values_{exec, std::forward<ValuesArray>(values)},
+          col_idxs_{exec, std::forward<IndicesArray>(col_idxs)}
+    {
+        // Ensure that the value and col_idxs arrays have the correct size
+        auto num_elems = this->get_common_size()[0] * num_elems_per_row *
+                         this->get_num_batch_items();
+        GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems() + 1);
+        GKO_ENSURE_IN_BOUNDS(num_elems, col_idxs_.get_num_elems() + 1);
+    }
+
+    /**
+     * Creates a Ell matrix with the same configuration as the callers
+     * matrix.
+     *
+     * @returns a Ell matrix with the same configuration as the caller.
+     */
+    std::unique_ptr<Ell> create_with_same_config() const;
+
+    void apply_impl(const MultiVector<value_type>* b,
+                    MultiVector<value_type>* x) const;
+
+    void apply_impl(const MultiVector<value_type>* alpha,
+                    const MultiVector<value_type>* b,
+                    const MultiVector<value_type>* beta,
+                    MultiVector<value_type>* x) const;
+
+private:
+    int num_elems_per_row_;
+    array<value_type> values_;
+    array<index_type> col_idxs_;
+};
+
+
+}  // namespace matrix
+}  // namespace batch
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_MATRIX_BATCH_ELL_HPP_

From d3b8ad1e7933851be777937705d1e3dcff618f54 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 6 Oct 2023 15:13:34 +0200
Subject: [PATCH 02/18] Add ref, omp kernels and scaffold

Co-authored-by: Aditya Kashi <kashia@ornl.gov>
---
 core/CMakeLists.txt                        |   1 +
 core/device_hooks/common_kernels.inc.cpp   |  10 +
 core/matrix/batch_ell_kernels.hpp          |  84 ++++
 core/test/matrix/CMakeLists.txt            |   1 +
 core/test/matrix/batch_ell.cpp             | 478 +++++++++++++++++++++
 cuda/CMakeLists.txt                        |   1 +
 cuda/matrix/batch_ell_kernels.cu           |  86 ++++
 dpcpp/CMakeLists.txt                       |   1 +
 dpcpp/matrix/batch_ell_kernels.dp.cpp      | 102 +++++
 hip/CMakeLists.txt                         |   1 +
 hip/matrix/batch_ell_kernels.hip.cpp       |  86 ++++
 omp/CMakeLists.txt                         |   1 +
 omp/matrix/batch_ell_kernels.cpp           | 117 +++++
 reference/CMakeLists.txt                   |   1 +
 reference/matrix/batch_ell_kernels.cpp     | 116 +++++
 reference/matrix/batch_ell_kernels.hpp.inc |  78 ++++
 16 files changed, 1164 insertions(+)
 create mode 100644 core/matrix/batch_ell_kernels.hpp
 create mode 100644 core/test/matrix/batch_ell.cpp
 create mode 100644 cuda/matrix/batch_ell_kernels.cu
 create mode 100644 dpcpp/matrix/batch_ell_kernels.dp.cpp
 create mode 100644 hip/matrix/batch_ell_kernels.hip.cpp
 create mode 100644 omp/matrix/batch_ell_kernels.cpp
 create mode 100644 reference/matrix/batch_ell_kernels.cpp
 create mode 100644 reference/matrix/batch_ell_kernels.hpp.inc

diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 46ea67abc65..ae8035bcbf9 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -40,6 +40,7 @@ target_sources(ginkgo
     log/record.cpp
     log/stream.cpp
     matrix/batch_dense.cpp
+    matrix/batch_ell.cpp
     matrix/coo.cpp
     matrix/csr.cpp
     matrix/dense.cpp
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 87cab3dcf0b..b685063da10 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -310,6 +310,16 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
 }  // namespace batch_dense
 
 
+namespace batch_ell {
+
+
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
+
+
+}  // namespace batch_ell
+
+
 namespace dense {
 
 
diff --git a/core/matrix/batch_ell_kernels.hpp b/core/matrix/batch_ell_kernels.hpp
new file mode 100644
index 00000000000..1b1ef345ae0
--- /dev/null
+++ b/core/matrix/batch_ell_kernels.hpp
@@ -0,0 +1,84 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_MATRIX_BATCH_ELL_KERNELS_HPP_
+#define GKO_CORE_MATRIX_BATCH_ELL_KERNELS_HPP_
+
+
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+#include "core/base/kernel_declaration.hpp"
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL(_vtype, _itype)  \
+    void simple_apply(std::shared_ptr<const DefaultExecutor> exec, \
+                      const batch::matrix::Ell<_vtype, _itype>* a, \
+                      const batch::MultiVector<_vtype>* b,         \
+                      batch::MultiVector<_vtype>* c)
+
+#define GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL(_vtype, _itype)      \
+    void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,     \
+                        const batch::MultiVector<_vtype>* alpha,         \
+                        const batch::matrix::Ell<_vtype, _itype>* a,     \
+                        const batch::MultiVector<_vtype>* b,             \
+                        const batch::MultiVector<_vtype>* beta,          \
+                        batch::MultiVector<_vtype>* c)
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                 \
+    template <typename ValueType, typename IndexType>                \
+    GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL(ValueType, IndexType); \
+    template <typename ValueType, typename IndexType>                \
+    GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL(ValueType, IndexType)
+
+
+GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_ell,
+                                        GKO_DECLARE_ALL_AS_TEMPLATES);
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_MATRIX_BATCH_ELL_KERNELS_HPP_
diff --git a/core/test/matrix/CMakeLists.txt b/core/test/matrix/CMakeLists.txt
index cca4b8da1c0..ec7ef93e517 100644
--- a/core/test/matrix/CMakeLists.txt
+++ b/core/test/matrix/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_test(batch_dense)
+ginkgo_create_test(batch_ell)
 ginkgo_create_test(coo)
 ginkgo_create_test(coo_builder)
 ginkgo_create_test(csr)
diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp
new file mode 100644
index 00000000000..931efb47d2e
--- /dev/null
+++ b/core/test/matrix/batch_ell.cpp
@@ -0,0 +1,478 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/range.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+
+
+#include "core/base/batch_utilities.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/batch_helpers.hpp"
+
+
+// Fixture: a batch of two 2 x 3 matrices plus matching reference objects.
+template <typename T>
+class Ell : public ::testing::Test {
+protected:
+    using value_type = T;
+    using size_type = gko::size_type;
+    using EllMtx = gko::matrix::Ell<value_type>;
+    using BatchEll = gko::batch::matrix::Ell<value_type>;
+    Ell()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(gko::batch::initialize<BatchEll>(
+              {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+               {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
+              exec)),
+          mvec(gko::batch::initialize<gko::batch::MultiVector<value_type>>(
+              {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+               {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
+              exec)),
+          ell_mtx(gko::initialize<EllMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}},
+                                          exec))
+    {}
+
+    static void assert_equal_to_original_mtx(BatchEll* m)
+    {
+        ASSERT_EQ(m->get_num_batch_items(), 2);
+        ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3));
+        ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 3));
+        EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0});
+        EXPECT_EQ(m->at(0, 0, 1), value_type{2.0});
+        EXPECT_EQ(m->at(0, 0, 2), value_type{3.0});
+        EXPECT_EQ(m->at(0, 1, 0), value_type{-1.5});
+        EXPECT_EQ(m->at(0, 1, 1), value_type{2.5});
+        ASSERT_EQ(m->at(0, 1, 2), value_type{3.5});
+        EXPECT_EQ(m->at(1, 0, 0), value_type{1.0});
+        EXPECT_EQ(m->at(1, 0, 1), value_type{2.5});
+        EXPECT_EQ(m->at(1, 0, 2), value_type{3.0});
+        EXPECT_EQ(m->at(1, 1, 0), value_type{1.0});
+        EXPECT_EQ(m->at(1, 1, 1), value_type{2.0});
+        ASSERT_EQ(m->at(1, 1, 2), value_type{3.0});
+    }
+
+    static void assert_empty(BatchEll* m)
+    {
+        ASSERT_EQ(m->get_num_batch_items(), 0);
+        ASSERT_EQ(m->get_num_stored_elements(), 0);
+    }
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::unique_ptr<BatchEll> mtx;
+    std::unique_ptr<gko::batch::MultiVector<value_type>> mvec;
+    std::unique_ptr<EllMtx> ell_mtx;
+};
+
+TYPED_TEST_SUITE(Ell, gko::test::ValueTypes);
+
+
+TYPED_TEST(Ell, KnowsItsSizeAndValues)
+{  // the fixture matrix must report the size and values it was built from
+    TestFixture::assert_equal_to_original_mtx(this->mtx.get());
+}
+
+
+TYPED_TEST(Ell, CanBeEmpty)
+{  // created from an executor alone, the matrix must report no content
+    const auto no_data = gko::batch::matrix::Ell<TypeParam>::create(this->exec);
+    TestFixture::assert_empty(no_data.get());
+}
+
+
+TYPED_TEST(Ell, ReturnsNullValuesArrayWhenEmpty)
+{  // a matrix created from only an executor must return a null value pointer
+    auto empty = gko::batch::matrix::Ell<TypeParam>::create(this->exec);
+    ASSERT_EQ(empty->get_const_values(), nullptr);
+}
+
+
+TYPED_TEST(Ell, CanGetValuesForEntry)
+{
+    using value_type = typename TestFixture::value_type;
+    // The fixture's batch item 1 was initialized with 1.0 as its first value.
+    ASSERT_EQ(this->mtx->get_values_for_item(1)[0], value_type{1.0});
+}
+
+
+TYPED_TEST(Ell, CanCreateEllItemView)
+{  // the view of batch item 1 must equal the fixture's standalone ell_mtx
+    GKO_ASSERT_MTX_NEAR(this->mtx->create_view_for_item(1), this->ell_mtx, 0.0);
+}
+
+
+TYPED_TEST(Ell, CanCreateMultiVectorView)
+{  // the multi-vector view is compared exactly against the fixture's mvec
+    GKO_ASSERT_BATCH_MTX_NEAR(this->mtx->create_multi_vector_view(), this->mvec,
+                              0.0);
+}
+
+
+TYPED_TEST(Ell, CanBeCopied)
+{
+    auto mtx_copy = gko::batch::matrix::Ell<TypeParam>::create(this->exec);
+
+    mtx_copy->copy_from(this->mtx.get());
+    // The source must be unchanged, and later writes to it must not reach the
+    // copy.
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->at(0, 0, 0) = 7;
+    this->mtx->at(0, 1) = 7;
+    this->assert_equal_to_original_mtx(mtx_copy.get());
+}
+
+
+TYPED_TEST(Ell, CanBeMoved)
+{
+    auto mtx_copy = gko::batch::matrix::Ell<TypeParam>::create(this->exec);
+
+    this->mtx->move_to(mtx_copy);
+    // The move target must now hold the fixture's original contents.
+    this->assert_equal_to_original_mtx(mtx_copy.get());
+}
+
+
+TYPED_TEST(Ell, CanBeCloned)
+{
+    auto mtx_clone = this->mtx->clone();
+    // Cast the clone to the fixture matrix's pointer type before checking it.
+    this->assert_equal_to_original_mtx(
+        dynamic_cast<decltype(this->mtx.get())>(mtx_clone.get()));
+}
+
+
+TYPED_TEST(Ell, CanBeCleared)
+{
+    this->mtx->clear();
+    // After clear() the matrix must report zero items and stored elements.
+    this->assert_empty(this->mtx.get());
+}
+
+
+TYPED_TEST(Ell, CanBeConstructedWithSize)
+{
+    // Two batch items sharing the common size 5 x 3; the stored element
+    // count asserted below is 2 * 5 * 3 = 30.
+    auto m = gko::batch::matrix::Ell<TypeParam>::create(
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>{5, 3}));
+
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(5, 3));
+    ASSERT_EQ(m->get_num_stored_elements(), 30);
+}
+
+
+TYPED_TEST(Ell, CanBeConstructedFromExistingData)
+{
+    using value_type = typename TestFixture::value_type;
+    // Only the first 8 entries are wrapped by the view: two 2 x 2 items.
+    // clang-format off
+    value_type data[] = {
+       1.0,  2.0,
+      -1.0,  3.0,
+       4.0, -1.0,
+       3.0,  5.0,
+       1.0,  5.0,
+       6.0, -3.0};
+    // clang-format on
+
+    auto m = gko::batch::matrix::Ell<TypeParam>::create(
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)),
+        gko::array<value_type>::view(this->exec, 8, data));
+
+    ASSERT_EQ(m->get_const_values(), data);
+    ASSERT_EQ(m->at(0, 0, 0), value_type{1.0});
+    ASSERT_EQ(m->at(0, 0, 1), value_type{2.0});
+    ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0});
+    ASSERT_EQ(m->at(0, 1, 1), value_type{3.0});
+    ASSERT_EQ(m->at(1, 0, 0), value_type{4.0});
+    ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0});
+    ASSERT_EQ(m->at(1, 1, 0), value_type{3.0});
+    ASSERT_EQ(m->at(1, 1, 1), value_type{5.0});
+}
+
+
+TYPED_TEST(Ell, CanBeConstructedFromExistingConstData)
+{
+    using value_type = typename TestFixture::value_type;
+    // Only the first 8 entries are wrapped by the const view: two 2 x 2 items.
+    // clang-format off
+    const value_type data[] = {
+       1.0,  2.0,
+      -1.0,  3.0,
+       4.0, -1.0,
+       3.0,  5.0,
+       1.0,  5.0,
+       6.0, -3.0};
+    // clang-format on
+
+    auto m = gko::batch::matrix::Ell<TypeParam>::create_const(
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)),
+        gko::array<value_type>::const_view(this->exec, 8, data));
+
+    ASSERT_EQ(m->get_const_values(), data);
+    ASSERT_EQ(m->at(0, 0, 0), value_type{1.0});
+    ASSERT_EQ(m->at(0, 0, 1), value_type{2.0});
+    ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0});
+    ASSERT_EQ(m->at(0, 1, 1), value_type{3.0});
+    ASSERT_EQ(m->at(1, 0, 0), value_type{4.0});
+    ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0});
+    ASSERT_EQ(m->at(1, 1, 0), value_type{3.0});
+    ASSERT_EQ(m->at(1, 1, 1), value_type{5.0});
+}
+
+
+TYPED_TEST(Ell, CanBeConstructedFromEllMatrices)
+{
+    using value_type = typename TestFixture::value_type;
+    using EllMtx = typename TestFixture::EllMtx;
+
+    // mat1 and mat2 carry the values of the fixture's two batch items.
+    auto mat1 = gko::initialize<EllMtx>({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+                                        this->exec);
+    auto mat2 =
+        gko::initialize<EllMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec);
+
+    auto m = gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
+        this->exec, std::vector<EllMtx*>{mat1.get(), mat2.get()});
+
+    this->assert_equal_to_original_mtx(m.get());
+}
+
+
+TYPED_TEST(Ell, CanBeConstructedFromEllMatricesByDuplication)
+{
+    using value_type = typename TestFixture::value_type;
+    using EllMtx = typename TestFixture::EllMtx;
+
+    auto mat1 = gko::initialize<EllMtx>(4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+                                        this->exec);
+
+    // Reference batch: the same item listed three times explicitly; it is
+    // compared with the overload that takes a duplication count.
+    auto bat_m =
+        gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
+            this->exec,
+            std::vector<EllMtx*>{mat1.get(), mat1.get(), mat1.get()});
+    // Same item, duplicated three times via the count argument.
+    auto m = gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
+        this->exec, 3, mat1.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14);
+}
+
+
+TYPED_TEST(Ell, CanBeConstructedByDuplicatingEllMatrices)
+{
+    using value_type = typename TestFixture::value_type;
+    using EllMtx = typename TestFixture::EllMtx;
+
+    auto mat1 = gko::initialize<EllMtx>({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+                                        this->exec);
+    auto mat2 =
+        gko::initialize<EllMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec);
+
+    auto m = gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
+        this->exec, std::vector<EllMtx*>{mat1.get(), mat2.get()});
+    // Expected result: the two-item batch repeated three times in order.
+    auto m_ref =
+        gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
+            this->exec,
+            std::vector<EllMtx*>{mat1.get(), mat2.get(), mat1.get(), mat2.get(),
+                                 mat1.get(), mat2.get()});
+
+    auto m2 = gko::batch::duplicate<gko::batch::matrix::Ell<value_type>>(
+        this->exec, 3, m.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14);
+}
+
+
+TYPED_TEST(Ell, CanBeUnbatchedIntoEllMatrices)
+{
+    using value_type = typename TestFixture::value_type;
+    using EllMtx = typename TestFixture::EllMtx;
+    // Expected items, holding the values of the fixture's batch in order.
+    auto mat1 = gko::initialize<EllMtx>(4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+                                        this->exec);
+    auto mat2 =
+        gko::initialize<EllMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec);
+
+    auto ell_mats = gko::batch::unbatch<gko::batch::matrix::Ell<value_type>>(
+        this->mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(ell_mats[0].get(), mat1.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(ell_mats[1].get(), mat2.get(), 0.);
+}
+
+
+TYPED_TEST(Ell, CanBeListConstructed)
+{
+    using value_type = typename TestFixture::value_type;
+    auto m = gko::batch::initialize<gko::batch::matrix::Ell<TypeParam>>(
+        {{1.0, 2.0}, {1.0, 3.0}}, this->exec);
+    // Each inner list becomes one batch item of common size 2 x 1.
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1));
+    EXPECT_EQ(m->at(0, 0), value_type{1});
+    EXPECT_EQ(m->at(0, 1), value_type{2});
+    EXPECT_EQ(m->at(1, 0), value_type{1});
+    EXPECT_EQ(m->at(1, 1), value_type{3});
+}
+
+
+TYPED_TEST(Ell, CanBeListConstructedByCopies)
+{
+    using value_type = typename TestFixture::value_type;
+    // Each item is the 2 x 1 column (1, 2)^T, so the second entry is row 1.
+    auto m = gko::batch::initialize<gko::batch::matrix::Ell<TypeParam>>(
+        2, I<value_type>({1.0, 2.0}), this->exec);
+
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1));
+    EXPECT_EQ(m->at(0, 0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(0, 1, 0), value_type{2.0});
+    EXPECT_EQ(m->at(1, 0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(1, 1, 0), value_type{2.0});
+}
+
+
+TYPED_TEST(Ell, CanBeDoubleListConstructed)
+{
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+    // Two 3 x 3 items given row by row; at(item, idx) is checked in list order.
+    auto m = gko::batch::initialize<gko::batch::matrix::Ell<TypeParam>>(
+        {{I<T>{1.0, 1.0, 0.0}, I<T>{2.0, 4.0, 3.0}, I<T>{3.0, 6.0, 1.0}},
+         {I<T>{1.0, 2.0, -1.0}, I<T>{3.0, 4.0, -2.0}, I<T>{5.0, 6.0, -3.0}}},
+        this->exec);
+
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(3, 3));
+    EXPECT_EQ(m->at(0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(0, 1), value_type{1.0});
+    EXPECT_EQ(m->at(0, 2), value_type{0.0});
+    ASSERT_EQ(m->at(0, 3), value_type{2.0});
+    EXPECT_EQ(m->at(0, 4), value_type{4.0});
+    EXPECT_EQ(m->at(1, 0), value_type{1.0});
+    EXPECT_EQ(m->at(1, 1), value_type{2.0});
+    EXPECT_EQ(m->at(1, 2), value_type{-1.0});
+    ASSERT_EQ(m->at(1, 3), value_type{3.0});
+    EXPECT_EQ(m->at(1, 4), value_type{4.0});
+}
+
+
+TYPED_TEST(Ell, CanBeReadFromMatrixData)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = int;
+    using mdata = gko::matrix_data<value_type, index_type>;
+
+    // Two 2 x 2 items, each with an explicitly listed zero at (1, 0).
+    auto vec_data = std::vector<mdata>{
+        mdata({2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 0, 0.0}, {1, 1, 5.0}}),
+        mdata({2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 0, 0.0}, {1, 1, 9.0}})};
+
+    auto m = gko::batch::read<value_type, index_type,
+                              gko::batch::matrix::Ell<value_type>>(this->exec,
+                                                                   vec_data);
+
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2));
+    EXPECT_EQ(m->at(0, 0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(0, 0, 1), value_type{3.0});
+    EXPECT_EQ(m->at(0, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(0, 1, 1), value_type{5.0});
+    EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0});
+    EXPECT_EQ(m->at(1, 0, 1), value_type{0.5});
+    EXPECT_EQ(m->at(1, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(1, 1, 1), value_type{9.0});
+}
+
+
+TYPED_TEST(Ell, CanBeReadFromSparseMatrixData)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = int;
+    using mdata = gko::matrix_data<value_type, index_type>;
+    // Two 2 x 2 items whose (1, 0) entry is absent from the input data.
+    auto vec_data = std::vector<mdata>{
+        mdata({2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 1, 5.0}}),
+        mdata({2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 1, 9.0}})};
+
+    auto m = gko::batch::read<value_type, index_type,
+                              gko::batch::matrix::Ell<value_type>>(this->exec,
+                                                                   vec_data);
+
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2));
+    EXPECT_EQ(m->at(0, 0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(0, 0, 1), value_type{3.0});
+    EXPECT_EQ(m->at(0, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(0, 1, 1), value_type{5.0});
+    EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0});
+    EXPECT_EQ(m->at(1, 0, 1), value_type{0.5});
+    EXPECT_EQ(m->at(1, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(1, 1, 1), value_type{9.0});
+}
+
+
+TYPED_TEST(Ell, GeneratesCorrectMatrixData)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = int;
+    using tpl = typename gko::matrix_data<value_type, index_type>::nonzero_type;
+    // tpl uses the same index_type as write(); entries are checked row by row.
+    auto data =
+        gko::batch::write<value_type, index_type,
+                          gko::batch::matrix::Ell<value_type>>(this->mtx.get());
+
+    ASSERT_EQ(data[0].size, gko::dim<2>(2, 3));
+    ASSERT_EQ(data[0].nonzeros.size(), 6);
+    EXPECT_EQ(data[0].nonzeros[0], tpl(0, 0, value_type{-1.0}));
+    EXPECT_EQ(data[0].nonzeros[1], tpl(0, 1, value_type{2.0}));
+    EXPECT_EQ(data[0].nonzeros[2], tpl(0, 2, value_type{3.0}));
+    EXPECT_EQ(data[0].nonzeros[3], tpl(1, 0, value_type{-1.5}));
+    EXPECT_EQ(data[0].nonzeros[4], tpl(1, 1, value_type{2.5}));
+    EXPECT_EQ(data[0].nonzeros[5], tpl(1, 2, value_type{3.5}));
+    ASSERT_EQ(data[1].size, gko::dim<2>(2, 3));
+    ASSERT_EQ(data[1].nonzeros.size(), 6);
+    EXPECT_EQ(data[1].nonzeros[0], tpl(0, 0, value_type{1.0}));
+    EXPECT_EQ(data[1].nonzeros[1], tpl(0, 1, value_type{2.5}));
+    EXPECT_EQ(data[1].nonzeros[2], tpl(0, 2, value_type{3.0}));
+    EXPECT_EQ(data[1].nonzeros[3], tpl(1, 0, value_type{1.0}));
+    EXPECT_EQ(data[1].nonzeros[4], tpl(1, 1, value_type{2.0}));
+    EXPECT_EQ(data[1].nonzeros[5], tpl(1, 2, value_type{3.0}));
+}
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index dfa1b2177ee..f5b7932ed39 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -39,6 +39,7 @@ target_sources(ginkgo_cuda
     factorization/par_ilut_spgeam_kernel.cu
     factorization/par_ilut_sweep_kernel.cu
     matrix/batch_dense_kernels.cu
+    matrix/batch_ell_kernels.cu
     matrix/coo_kernels.cu
     ${CSR_INSTANTIATE}
     matrix/dense_kernels.cu
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
new file mode 100644
index 00000000000..c41b436daed
--- /dev/null
+++ b/cuda/matrix/batch_ell_kernels.cu
@@ -0,0 +1,86 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/cublas_bindings.hpp"
+#include "cuda/base/pointer_mode_guard.hpp"
+#include "cuda/base/thrust.cuh"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/components/uninitialized_array.hpp"
+#include "cuda/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+// NOTE(review): presumably launch parameters for the .inc files below.
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+
+// Included textually into this namespace, after the kernels file above.
+#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_ell
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 9990496c98f..9c2e799ede9 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -37,6 +37,7 @@ target_sources(ginkgo_dpcpp
     factorization/par_ilut_spgeam_kernel.dp.cpp
     factorization/par_ilut_sweep_kernel.dp.cpp
     matrix/batch_dense_kernels.dp.cpp
+    matrix/batch_ell_kernels.dp.cpp
     matrix/coo_kernels.dp.cpp
     matrix/csr_kernels.dp.cpp
     matrix/fbcsr_kernels.dp.cpp
diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
new file mode 100644
index 00000000000..f886b7dd790
--- /dev/null
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -0,0 +1,102 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <CL/sycl.hpp>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+// #include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
+
+// Stub only: the body is GKO_NOT_IMPLEMENTED, no DPC++ kernel is provided.
+template <typename ValueType, typename IndexType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const batch::matrix::Ell<ValueType, IndexType>* mat,
+                  const batch::MultiVector<ValueType>* b,
+                  batch::MultiVector<ValueType>* x) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
+
+// Stub only: the body is GKO_NOT_IMPLEMENTED, no DPC++ kernel is provided.
+template <typename ValueType, typename IndexType>
+void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
+                    const batch::MultiVector<ValueType>* alpha,
+                    const batch::matrix::Ell<ValueType, IndexType>* mat,
+                    const batch::MultiVector<ValueType>* b,
+                    const batch::MultiVector<ValueType>* beta,
+                    batch::MultiVector<ValueType>* x) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
+
+
+}  // namespace batch_ell
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 21b573b6cd0..ccc88769a4e 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -36,6 +36,7 @@ set(GINKGO_HIP_SOURCES
     factorization/par_ilut_spgeam_kernel.hip.cpp
     factorization/par_ilut_sweep_kernel.hip.cpp
     matrix/batch_dense_kernels.hip.cpp
+    matrix/batch_ell_kernels.hip.cpp
     matrix/coo_kernels.hip.cpp
     ${CSR_INSTANTIATE}
     matrix/dense_kernels.hip.cpp
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
new file mode 100644
index 00000000000..c41b436daed
--- /dev/null
+++ b/hip/matrix/batch_ell_kernels.hip.cpp
@@ -0,0 +1,86 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/hipblas_bindings.hip.hpp"
+#include "hip/base/pointer_mode_guard.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+// HIP backend: the kernels are defined in kernels::hip using hip/ headers.
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_ell
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index d87399492f5..aa8e30cd590 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -24,6 +24,7 @@ target_sources(ginkgo_omp
     factorization/par_ilu_kernels.cpp
     factorization/par_ilut_kernels.cpp
     matrix/batch_dense_kernels.cpp
+    matrix/batch_ell_kernels.cpp
     matrix/coo_kernels.cpp
     matrix/csr_kernels.cpp
     matrix/dense_kernels.cpp
diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp
new file mode 100644
index 00000000000..282920c05f3
--- /dev/null
+++ b/omp/matrix/batch_ell_kernels.cpp
@@ -0,0 +1,117 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+/**
+ * @brief Namespace holding the OpenMP kernels of the batch Ell matrix format.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+#include "reference/matrix/batch_ell_kernels.hpp.inc"
+
+// Runs simple_apply_kernel once per batch item of x, in an OpenMP loop.
+template <typename ValueType, typename IndexType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const batch::matrix::Ell<ValueType, IndexType>* mat,
+                  const batch::MultiVector<ValueType>* b,
+                  batch::MultiVector<ValueType>* x)
+{
+    const auto rhs_batch = host::get_batch_struct(b);
+    const auto res_batch = host::get_batch_struct(x);
+    const auto mat_batch = host::get_batch_struct(mat);
+    const auto num_items = x->get_num_batch_items();
+#pragma omp parallel for
+    for (size_type item = 0; item < num_items; ++item) {
+        simple_apply_kernel(batch::matrix::extract_batch_item(mat_batch, item),
+                            batch::extract_batch_item(rhs_batch, item),
+                            batch::extract_batch_item(res_batch, item));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
+
+// As above, passing the first value of each item's alpha and beta as scalars.
+template <typename ValueType, typename IndexType>
+void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
+                    const batch::MultiVector<ValueType>* alpha,
+                    const batch::matrix::Ell<ValueType, IndexType>* mat,
+                    const batch::MultiVector<ValueType>* b,
+                    const batch::MultiVector<ValueType>* beta,
+                    batch::MultiVector<ValueType>* x)
+{
+    const auto rhs_batch = host::get_batch_struct(b);
+    const auto res_batch = host::get_batch_struct(x);
+    const auto mat_batch = host::get_batch_struct(mat);
+    const auto alpha_batch = host::get_batch_struct(alpha);
+    const auto beta_batch = host::get_batch_struct(beta);
+    const auto num_items = x->get_num_batch_items();
+#pragma omp parallel for
+    for (size_type item = 0; item < num_items; ++item) {
+        advanced_apply_kernel(
+            batch::extract_batch_item(alpha_batch, item).values[0],
+            batch::matrix::extract_batch_item(mat_batch, item),
+            batch::extract_batch_item(rhs_batch, item),
+            batch::extract_batch_item(beta_batch, item).values[0],
+            batch::extract_batch_item(res_batch, item));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
+
+
+}  // namespace batch_ell
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index 37498588ca7..21dfc0dfb5a 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -26,6 +26,7 @@ target_sources(ginkgo_reference
     factorization/par_ilu_kernels.cpp
     factorization/par_ilut_kernels.cpp
     matrix/batch_dense_kernels.cpp
+    matrix/batch_ell_kernels.cpp
     matrix/coo_kernels.cpp
     matrix/csr_kernels.cpp
     matrix/dense_kernels.cpp
diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp
new file mode 100644
index 00000000000..1fab322dc5f
--- /dev/null
+++ b/reference/matrix/batch_ell_kernels.cpp
@@ -0,0 +1,116 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+#include "reference/matrix/batch_ell_kernels.hpp.inc"
+
+
+template <typename ValueType, typename IndexType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const batch::matrix::Ell<ValueType, IndexType>* mat,
+                  const batch::MultiVector<ValueType>* b,
+                  batch::MultiVector<ValueType>* x)
+{
+    const auto mat_view = host::get_batch_struct(mat);
+    const auto rhs_view = host::get_batch_struct(b);
+    const auto sol_view = host::get_batch_struct(x);
+    for (size_type id = 0; id < x->get_num_batch_items(); ++id) {
+        // Sequential reference path: one kernel call per batch item.
+        simple_apply_kernel(batch::matrix::extract_batch_item(mat_view, id),
+                            batch::extract_batch_item(rhs_view, id),
+                            batch::extract_batch_item(sol_view, id));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
+                    const batch::MultiVector<ValueType>* alpha,
+                    const batch::matrix::Ell<ValueType, IndexType>* mat,
+                    const batch::MultiVector<ValueType>* b,
+                    const batch::MultiVector<ValueType>* beta,
+                    batch::MultiVector<ValueType>* x)
+{
+    const auto mat_view = host::get_batch_struct(mat);
+    const auto rhs_view = host::get_batch_struct(b);
+    const auto sol_view = host::get_batch_struct(x);
+    const auto alpha_view = host::get_batch_struct(alpha);
+    const auto beta_view = host::get_batch_struct(beta);
+    for (size_type id = 0; id < x->get_num_batch_items(); ++id) {
+        // alpha and beta enter the kernel as scalars: only values[0] is read.
+        advanced_apply_kernel(
+            batch::extract_batch_item(alpha_view, id).values[0],
+            batch::matrix::extract_batch_item(mat_view, id),
+            batch::extract_batch_item(rhs_view, id),
+            batch::extract_batch_item(beta_view, id).values[0],
+            batch::extract_batch_item(sol_view, id));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
+
+
+}  // namespace batch_ell
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp.inc
new file mode 100644
index 00000000000..1874d1db9f3
--- /dev/null
+++ b/reference/matrix/batch_ell_kernels.hpp.inc
@@ -0,0 +1,78 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+template <typename ValueType, typename IndexType>
+inline void simple_apply_kernel(
+    const gko::batch::matrix::batch_ell::batch_item<const ValueType,
+                                                    const IndexType>& a,
+    const gko::batch::multi_vector::batch_item<const ValueType>& b,
+    const gko::batch::multi_vector::batch_item<ValueType>& c)
+{
+    for (int row = 0; row < a.num_rows; ++row) {
+        for (int rhs = 0; rhs < b.num_rhs; ++rhs) {  // c(row, :) = 0
+            c.values[row * c.stride + rhs] = zero<ValueType>();
+        }
+        for (auto k = 0; k < a.num_stored_elems_per_row; ++k) {
+            const auto pos = row + k * a.stride;  // k-th stored slot of row
+            const auto val = a.values[pos];  // padding slots are not skipped
+            const auto b_row = b.values + a.col_idxs[pos] * b.stride;
+            for (int rhs = 0; rhs < b.num_rhs; ++rhs) {  // c(row,:) += val*b(col,:)
+                c.values[row * c.stride + rhs] += val * b_row[rhs];
+            }
+        }
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+inline void advanced_apply_kernel(
+    const ValueType alpha,
+    const gko::batch::matrix::batch_ell::batch_item<const ValueType,
+                                                    const IndexType>& a,
+    const gko::batch::multi_vector::batch_item<const ValueType>& b,
+    const ValueType beta,
+    const gko::batch::multi_vector::batch_item<ValueType>& c)
+{
+    for (int row = 0; row < a.num_rows; ++row) {
+        for (int rhs = 0; rhs < c.num_rhs; ++rhs) {  // c(row, :) *= beta
+            c.values[row * c.stride + rhs] *= beta;
+        }
+        for (auto k = 0; k < a.num_stored_elems_per_row; ++k) {
+            const auto pos = row + k * a.stride;  // k-th stored slot of row
+            const auto val = a.values[pos];  // padding slots are not skipped
+            const auto b_row = b.values + a.col_idxs[pos] * b.stride;
+            for (int rhs = 0; rhs < b.num_rhs; ++rhs) {  // accumulate into c(row,:)
+                c.values[row * c.stride + rhs] += alpha * val * b_row[rhs];
+            }
+        }
+    }
+}

From 658a5af3bf652feb9d66d2ff97e9ef3829a9b7bd Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 6 Oct 2023 17:19:37 +0200
Subject: [PATCH 03/18] Use only int32

---
 .../matrix/batch_ell_kernel_launcher.hpp.inc  | 53 +++++++++++
 core/device_hooks/common_kernels.inc.cpp      | 10 +-
 core/matrix/batch_ell.cpp                     | 34 ++-----
 core/matrix/batch_ell_kernels.hpp             | 20 ++--
 core/matrix/batch_struct.hpp                  | 95 +++++++++++++++++++
 cuda/matrix/batch_ell_kernels.cu              |  4 +-
 dpcpp/matrix/batch_ell_kernels.dp.cpp         |  4 +-
 hip/matrix/batch_ell_kernels.hip.cpp          |  4 +-
 include/ginkgo/core/base/types.hpp            | 16 ++++
 include/ginkgo/core/matrix/batch_ell.hpp      | 31 +++---
 omp/matrix/batch_ell_kernels.cpp              |  4 +-
 reference/matrix/batch_ell_kernels.cpp        |  4 +-
 reference/matrix/batch_ell_kernels.hpp.inc    | 10 +-
 reference/matrix/batch_struct.hpp             | 35 +++++++
 14 files changed, 256 insertions(+), 68 deletions(-)
 create mode 100644 common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc

diff --git a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc
new file mode 100644
index 00000000000..263e911c31a
--- /dev/null
+++ b/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc
@@ -0,0 +1,53 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+template <typename ValueType, typename IndexType>  // CUDA/HIP entry point
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const batch::matrix::Ell<ValueType, IndexType>* mat,
+                  const batch::MultiVector<ValueType>* b,
+                  batch::MultiVector<ValueType>* x) GKO_NOT_IMPLEMENTED;  // body comes from the macro; no kernel launch here
+
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+    GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
+
+
+template <typename ValueType, typename IndexType>  // CUDA/HIP entry point
+void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
+                    const batch::MultiVector<ValueType>* alpha,
+                    const batch::matrix::Ell<ValueType, IndexType>* mat,
+                    const batch::MultiVector<ValueType>* b,
+                    const batch::MultiVector<ValueType>* beta,
+                    batch::MultiVector<ValueType>* x) GKO_NOT_IMPLEMENTED;  // body comes from the macro; no kernel launch here
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+    GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index b685063da10..462675c15db 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -58,6 +58,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/factorization/par_ilu_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
 #include "core/matrix/batch_dense_kernels.hpp"
+#include "core/matrix/batch_ell_kernels.hpp"
 #include "core/matrix/coo_kernels.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
@@ -137,6 +138,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro)
 
+#define GKO_STUB_VALUE_AND_INT32_TYPE(_macro)                       \
+    template <typename ValueType, typename IndexType>               \
+    _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro)  // like GKO_STUB_VALUE_AND_INDEX_TYPE, but instantiated for int32 indices only
+
 #define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(_macro)                     \
     template <typename InputValueType, typename MatrixValueType,        \
               typename OutputValueType, typename IndexType>             \
@@ -313,8 +319,8 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
 namespace batch_ell {
 
 
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
 
 
 }  // namespace batch_ell
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
index 63d4f0dda8a..3aea6e1aae4 100644
--- a/core/matrix/batch_ell.cpp
+++ b/core/matrix/batch_ell.cpp
@@ -64,24 +64,6 @@ GKO_REGISTER_OPERATION(advanced_apply, batch_ell::advanced_apply);
 }  // namespace ell
 
 
-namespace detail {
-
-
-template <typename ValueType, typename IndexType>
-batch_dim<2> compute_batch_size(
-    const std::vector<gko::matrix::Ell<ValueType, IndexType>*>& matrices)
-{
-    auto common_size = matrices[0]->get_size();
-    for (size_type i = 1; i < matrices.size(); ++i) {
-        GKO_ASSERT_EQUAL_DIMENSIONS(common_size, matrices[i]->get_size());
-    }
-    return batch_dim<2>{matrices.size(), common_size};
-}
-
-
-}  // namespace detail
-
-
 template <typename ValueType, typename IndexType>
 std::unique_ptr<gko::matrix::Ell<ValueType, IndexType>>
 Ell<ValueType, IndexType>::create_view_for_item(size_type item_id)
@@ -145,7 +127,8 @@ template <typename ValueType, typename IndexType>
 std::unique_ptr<const Ell<ValueType, IndexType>>
 Ell<ValueType, IndexType>::create_const(
     std::shared_ptr<const Executor> exec, const batch_dim<2>& sizes,
-    int num_elems_per_row, gko::detail::const_array_view<ValueType>&& values,
+    const IndexType num_elems_per_row,
+    gko::detail::const_array_view<ValueType>&& values,
     gko::detail::const_array_view<IndexType>&& col_idxs)
 {
     // cast const-ness away, but return a const object afterwards,
@@ -166,7 +149,8 @@ inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes)
 
 template <typename ValueType, typename IndexType>
 Ell<ValueType, IndexType>::Ell(std::shared_ptr<const Executor> exec,
-                               const batch_dim<2>& size, int num_elems_per_row)
+                               const batch_dim<2>& size,
+                               IndexType num_elems_per_row)
     : EnableBatchLinOp<Ell<ValueType, IndexType>>(exec, size),
       num_elems_per_row_(num_elems_per_row),
       values_(exec, compute_num_elems(size, num_elems_per_row)),
@@ -209,7 +193,7 @@ void Ell<ValueType, IndexType>::apply_impl(const MultiVector<ValueType>* alpha,
 
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::convert_to(
-    Ell<next_precision<ValueType, IndexType>>* result) const
+    Ell<next_precision<ValueType>, IndexType>* result) const
 {
     result->values_ = this->values_;
     result->col_idxs_ = this->col_idxs_;
@@ -218,16 +202,16 @@ void Ell<ValueType, IndexType>::convert_to(
 }
 
 
-template <typename ValueType>
+template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::move_to(
-    Ell<next_precision<ValueType, IndexType>>* result)
+    Ell<next_precision<ValueType>, IndexType>* result)
 {
     this->convert_to(result);
 }
 
 
-#define GKO_DECLARE_BATCH_ELL_MATRIX(_type) class Ell<_vtype, _itype>
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_BATCH_ELL_MATRIX);
+#define GKO_DECLARE_BATCH_ELL_MATRIX(ValueType) class Ell<ValueType, int32>
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_ELL_MATRIX);
 
 
 }  // namespace matrix
diff --git a/core/matrix/batch_ell_kernels.hpp b/core/matrix/batch_ell_kernels.hpp
index 1b1ef345ae0..d3acc582f9b 100644
--- a/core/matrix/batch_ell_kernels.hpp
+++ b/core/matrix/batch_ell_kernels.hpp
@@ -52,16 +52,16 @@ namespace kernels {
 #define GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL(_vtype, _itype)  \
     void simple_apply(std::shared_ptr<const DefaultExecutor> exec, \
                       const batch::matrix::Ell<_vtype, _itype>* a, \
-                      const batch::MultiVector<_vtype, _itype>* b, \
-                      batch::MultiVector<_vtype, _itype>* c)
-
-#define GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL(_vtype, _itype)      \
-    void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,     \
-                        const batch::MultiVector<_vtype, _itype>* alpha, \
-                        const batch::matrix::Ell<_vtype, _itype>* a,     \
-                        const batch::MultiVector<_vtype, _itype>* b,     \
-                        const batch::MultiVector<_vtype, _itype>* beta,  \
-                        batch::MultiVector<_vtype, _itype>* c)
+                      const batch::MultiVector<_vtype>* b,         \
+                      batch::MultiVector<_vtype>* c)
+
+#define GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL(_vtype, _itype)  \
+    void advanced_apply(std::shared_ptr<const DefaultExecutor> exec, \
+                        const batch::MultiVector<_vtype>* alpha,     \
+                        const batch::matrix::Ell<_vtype, _itype>* a, \
+                        const batch::MultiVector<_vtype>* b,         \
+                        const batch::MultiVector<_vtype>* beta,      \
+                        batch::MultiVector<_vtype>* c)
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                                 \
     template <typename ValueType, typename IndexType>                \
diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp
index 0bbfde40cc9..272bb506df2 100644
--- a/core/matrix/batch_struct.hpp
+++ b/core/matrix/batch_struct.hpp
@@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 
 
 namespace gko {
@@ -82,6 +83,53 @@ struct uniform_batch {
 }  // namespace dense
 
 
+namespace batch_ell {
+
+
+/**
+ * Encapsulates one matrix from a batch of ell matrices. Members are filled
+ * positionally by to_const() and extract_batch_item(): keep their order. */
+template <typename ValueType>
+struct batch_item {
+    using value_type = ValueType;
+    using index_type = int32;
+
+    ValueType* values;           // start of this item's values
+    const index_type* col_idxs;  // start of this item's column indices
+    index_type stride;  // NOTE(review): kernels read slot k of a row at row + k * stride; confirm the intended layout
+    index_type num_rows;
+    index_type num_cols;
+    index_type num_stored_elems_per_row;  // slots kept for every row
+};
+
+
+/**
+ * A 'simple' structure to store a global uniform batch of ell matrices. Members
+ * are filled positionally by to_const(): keep their order. */
+template <typename ValueType>
+struct uniform_batch {
+    using value_type = ValueType;
+    using index_type = int32;  // same index type as entry_type (batch_item)
+    using entry_type = batch_item<value_type>;
+
+    ValueType* values;           // values of all items, one item after another
+    const index_type* col_idxs;  // column indices of all items
+    size_type num_batch_items;
+    index_type stride;
+    index_type num_rows;
+    index_type num_cols;
+    index_type num_stored_elems_per_row;
+
+    size_type get_entry_storage() const  // bytes of values in one batch item
+    {  // sizeof() leads so the product is evaluated in size_t, not in int
+        return sizeof(value_type) * num_rows * num_stored_elems_per_row;
+    }
+};
+
+
+}  // namespace batch_ell
+
+
 template <typename ValueType>
 GKO_ATTRIBUTES GKO_INLINE dense::batch_item<const ValueType> to_const(
     const dense::batch_item<ValueType>& b)
@@ -116,6 +164,53 @@ GKO_ATTRIBUTES GKO_INLINE dense::batch_item<ValueType> extract_batch_item(
 }
 
 
+template <typename ValueType>
+GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item<const ValueType> to_const(
+    const batch_ell::batch_item<ValueType>& b)
+{  // Same pointers and sizes, re-wrapped with a const value type.
+    return {b.values,   b.col_idxs, b.stride,
+            b.num_rows, b.num_cols, b.num_stored_elems_per_row};
+}
+
+
+template <typename ValueType>
+GKO_ATTRIBUTES GKO_INLINE batch_ell::uniform_batch<const ValueType> to_const(
+    const batch_ell::uniform_batch<ValueType>& ub)
+{  // Same pointers and sizes, re-wrapped with a const value type.
+    return {ub.values,   ub.col_idxs, ub.num_batch_items,         ub.stride,
+            ub.num_rows, ub.num_cols, ub.num_stored_elems_per_row};
+}
+
+
+template <typename ValueType>
+GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item<ValueType> extract_batch_item(
+    const batch_ell::uniform_batch<ValueType>& batch, const size_type batch_idx)
+{
+    // Entries that precede item `batch_idx` in both values and col_idxs; the
+    // per-item extent is taken as num_stored_elems_per_row * num_rows.
+    // NOTE(review): stride is forwarded untouched, not used as the item pitch.
+    const auto item_offset =
+        batch_idx * batch.num_stored_elems_per_row * batch.num_rows;
+    return {batch.values + item_offset, batch.col_idxs + item_offset,
+            batch.stride,               batch.num_rows,
+            batch.num_cols,             batch.num_stored_elems_per_row};
+}
+
+template <typename ValueType>
+GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item<ValueType> extract_batch_item(
+    ValueType* const batch_values, int* const batch_col_idxs, const int stride,
+    const int num_rows, const int num_cols, int num_elems_per_row,
+    const size_type batch_idx)
+{
+    // Same item lookup as the uniform_batch overload, for callers that hold
+    // the raw arrays: skip batch_idx items of num_elems_per_row * num_rows.
+    const auto item_offset = batch_idx * num_elems_per_row * num_rows;
+    return {batch_values + item_offset, batch_col_idxs + item_offset,
+            stride,                     num_rows,
+            num_cols,                   num_elems_per_row};
+}
+
+
 }  // namespace matrix
 }  // namespace batch
 }  // namespace gko
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
index c41b436daed..567d863d95c 100644
--- a/cuda/matrix/batch_ell_kernels.cu
+++ b/cuda/matrix/batch_ell_kernels.cu
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/batch_dense_kernels.hpp"
+#include "core/matrix/batch_ell_kernels.hpp"
 
 
 #include <thrust/functional.h>
@@ -72,7 +72,7 @@ constexpr int sm_oversubscription = 4;
 
 // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
 
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+// #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
 
 
 #include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
index f886b7dd790..cdcd5abd024 100644
--- a/dpcpp/matrix/batch_ell_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -80,7 +80,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::MultiVector<ValueType>* b,
                   batch::MultiVector<ValueType>* x) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
 
 
@@ -92,7 +92,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
                     const batch::MultiVector<ValueType>* beta,
                     batch::MultiVector<ValueType>* x) GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
 
 
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
index c41b436daed..567d863d95c 100644
--- a/hip/matrix/batch_ell_kernels.hip.cpp
+++ b/hip/matrix/batch_ell_kernels.hip.cpp
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/batch_dense_kernels.hpp"
+#include "core/matrix/batch_ell_kernels.hpp"
 
 
 #include <thrust/functional.h>
@@ -72,7 +72,7 @@ constexpr int sm_oversubscription = 4;
 
 // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
 
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+// #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
 
 
 #include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 68b5da6e3eb..f5a75c7448e 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -531,6 +531,22 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(double, int64)
 #endif
 
+#if GINKGO_DPCPP_SINGLE_MODE  // double and complex<double> become GKO_NOT_IMPLEMENTED specializations
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \
+    template _macro(float, int32);                            \
+    template <>                                               \
+    _macro(double, int32) GKO_NOT_IMPLEMENTED;                \
+    template _macro(std::complex<float>, int32);              \
+    template <>                                               \
+    _macro(std::complex<double>, int32) GKO_NOT_IMPLEMENTED
+#else  // instantiate _macro for all four value types, always with int32 indices
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \
+    template _macro(float, int32);                            \
+    template _macro(double, int32);                           \
+    template _macro(std::complex<float>, int32);              \
+    template _macro(std::complex<double>, int32)
+#endif  // GINKGO_DPCPP_SINGLE_MODE
+
 
 /**
  * Instantiates a template for each value and index type compiled by Ginkgo.
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index 374f1479664..af77fc1e390 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -88,7 +88,7 @@ class Ell final
     using EnableBatchLinOp<Ell>::move_to;
 
     using value_type = ValueType;
-    using index_type = int32;
+    using index_type = IndexType;
     using transposed_type = Ell<ValueType, IndexType>;
     using unbatch_type = gko::matrix::Ell<ValueType, IndexType>;
     using absolute_type = remove_complex<Ell>;
@@ -170,7 +170,7 @@ class Ell final
      * @return the number of elements stored in each row of the ELL matrix. Same
      * for each batch item
      */
-    int get_num_stored_elements_per_row() const noexcept
+    index_type get_num_stored_elements_per_row() const noexcept
     {
         return num_elems_per_row_;
     }
@@ -205,7 +205,7 @@ class Ell final
      *
      * @return the pointer to the array of col_idxs
      */
-    value_type* get_col_idxs_for_item(size_type batch_id) noexcept
+    index_type* get_col_idxs_for_item(size_type batch_id) noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return col_idxs_.get_data() +
@@ -219,8 +219,8 @@ class Ell final
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const value_type* get_const_col_idxs_for_item(
-        size_type batch_id) const noexcept
+    const index_type* get_const_col_idxs_for_item(size_type batch_id) const
+        noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return col_idxs_.get_const_data() +
@@ -249,8 +249,8 @@ class Ell final
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const value_type* get_const_values_for_item(
-        size_type batch_id) const noexcept
+    const value_type* get_const_values_for_item(size_type batch_id) const
+        noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return values_.get_const_data() +
@@ -271,9 +271,9 @@ class Ell final
      * array (if it resides on the same executor as the matrix) or a copy of the
      * array on the correct executor.
      */
-    static std::unique_ptr<const Ell<value_type, index_type>> create_const(
+    static std::unique_ptr<const Ell> create_const(
         std::shared_ptr<const Executor> exec, const batch_dim<2>& sizes,
-        const int num_elems_per_row,
+        const index_type num_elems_per_row,
         gko::detail::const_array_view<ValueType>&& values,
         gko::detail::const_array_view<IndexType>&& col_idxs);
 
@@ -309,9 +309,20 @@ class Ell final
     }
 
 private:
-    size_type compute_num_elems(const batch_dim<2>& size, int num_elems_per_row)
+    /**
+     * Computes the number of entries the value and column index arrays need
+     * to hold every item of the batch.
+     *
+     * @param size  the batch size (number of items and common dimensions)
+     * @param num_elems_per_row  the number of stored elements per row
+     *
+     * @return num_batch_items * num_rows * num_elems_per_row
+     */
+    size_type compute_num_elems(const batch_dim<2>& size,
+                                IndexType num_elems_per_row)
     {
-        return size->get_common_size()[0] * num_elems_per_row;
+        return size.get_num_batch_items() * size.get_common_size()[0] *
+               num_elems_per_row;
     }
 
 
@@ -325,7 +326,7 @@ class Ell final
      */
     Ell(std::shared_ptr<const Executor> exec,
         const batch_dim<2>& size = batch_dim<2>{},
-        const int num_elems_per_row = 0);
+        const IndexType num_elems_per_row = 0);
 
     /**
      * Creates a Ell matrix from an already allocated (and initialized)
@@ -345,7 +346,7 @@ class Ell final
      */
     template <typename ValuesArray, typename IndicesArray>
     Ell(std::shared_ptr<const Executor> exec, const batch_dim<2>& size,
-        const int num_elems_per_row, ValuesArray&& values,
+        const IndexType num_elems_per_row, ValuesArray&& values,
         IndicesArray&& col_idxs)
         : EnableBatchLinOp<Ell>(exec, size),
           num_elems_per_row_{num_elems_per_row},
@@ -353,7 +354,7 @@ class Ell final
           col_idxs_{exec, std::forward<IndicesArray>(col_idxs)}
     {
         // Ensure that the value and col_idxs arrays have the correct size
-        auto num_elems = this->get_size()[0] * num_elems_per_row() *
+        auto num_elems = this->get_common_size()[0] * num_elems_per_row *
                          this->get_num_batch_items();
         GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems() + 1);
         GKO_ENSURE_IN_BOUNDS(num_elems, col_idxs_.get_num_elems() + 1);
@@ -376,7 +377,7 @@ class Ell final
                     MultiVector<value_type>* x) const;
 
 private:
-    int num_elems_per_row_;
+    index_type num_elems_per_row_;
     array<value_type> values_;
     array<index_type> col_idxs_;
 };
diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp
index 282920c05f3..20ea4614e7d 100644
--- a/omp/matrix/batch_ell_kernels.cpp
+++ b/omp/matrix/batch_ell_kernels.cpp
@@ -78,7 +78,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
 
 
@@ -107,7 +107,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
 
 
diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp
index 1fab322dc5f..a3f69827c02 100644
--- a/reference/matrix/batch_ell_kernels.cpp
+++ b/reference/matrix/batch_ell_kernels.cpp
@@ -78,7 +78,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
 
 
@@ -106,7 +106,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
 
 
diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp.inc
index 1874d1db9f3..37370261d44 100644
--- a/reference/matrix/batch_ell_kernels.hpp.inc
+++ b/reference/matrix/batch_ell_kernels.hpp.inc
@@ -30,10 +30,9 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-template <typename ValueType, typename IndexType>
+template <typename ValueType>
 inline void simple_apply_kernel(
-    const gko::batch::matrix::batch_ell::batch_item<const ValueType,
-                                                    const IndexType>& a,
+    const gko::batch::matrix::batch_ell::batch_item<const ValueType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const gko::batch::multi_vector::batch_item<ValueType>& c)
 {
@@ -53,11 +52,10 @@ inline void simple_apply_kernel(
 }
 
 
-template <typename ValueType, typename IndexType>
+template <typename ValueType>
 inline void advanced_apply_kernel(
     const ValueType alpha,
-    const gko::batch::matrix::batch_ell::batch_item<const ValueType,
-                                                    const IndexType>& a,
+    const gko::batch::matrix::batch_ell::batch_item<const ValueType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const ValueType beta,
     const gko::batch::multi_vector::batch_item<ValueType>& c)
diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp
index 483d7717718..b5eacd80d18 100644
--- a/reference/matrix/batch_struct.hpp
+++ b/reference/matrix/batch_struct.hpp
@@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 
 
 #include "core/base/batch_struct.hpp"
@@ -90,6 +91,40 @@ inline batch::matrix::dense::uniform_batch<ValueType> get_batch_struct(
 }
 
 
+/**
+ * Generates an immutable uniform batch struct from a batch of ell matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::batch_ell::uniform_batch<const ValueType>
+get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
+{
+    return {op->get_const_values(),
+            op->get_const_col_idxs(),
+            op->get_num_batch_items(),
+            static_cast<int>(op->get_common_size()[0]),  // stride == num rows
+            static_cast<int>(op->get_common_size()[0]),  // num rows
+            static_cast<int>(op->get_common_size()[1]),  // num cols
+            static_cast<int>(op->get_num_stored_elements_per_row())};
+}
+
+
+/**
+ * Generates a uniform batch struct from a batch of ell matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::batch_ell::uniform_batch<ValueType> get_batch_struct(
+    batch::matrix::Ell<ValueType, int32>* const op)
+{
+    return {op->get_values(),
+            op->get_col_idxs(),
+            op->get_num_batch_items(),
+            static_cast<int>(op->get_common_size()[0]),  // stride == num rows
+            static_cast<int>(op->get_common_size()[0]),  // num rows
+            static_cast<int>(op->get_common_size()[1]),  // num cols
+            static_cast<int>(op->get_num_stored_elements_per_row())};
+}
+
+
 }  // namespace host
 }  // namespace kernels
 }  // namespace gko

From 1c3c5ff9a84cd3a5afee8bbbd219a1de3909a2b4 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Sat, 7 Oct 2023 17:31:07 +0200
Subject: [PATCH 04/18] Generalize and rewrite batch utils

---
 core/base/batch_utilities.hpp                 | 273 ++++++++++++++-
 core/matrix/batch_ell.cpp                     |  15 +-
 core/test/matrix/batch_ell.cpp                | 330 +++++++++---------
 .../ginkgo/core/base/batch_multi_vector.hpp   | 222 +-----------
 include/ginkgo/core/matrix/batch_ell.hpp      |  18 +-
 5 files changed, 449 insertions(+), 409 deletions(-)

diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp
index 834e89c8358..c37c0cae721 100644
--- a/core/base/batch_utilities.hpp
+++ b/core/base/batch_utilities.hpp
@@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -53,15 +52,18 @@ namespace gko {
 namespace batch {
 
 
-template <typename OutputType>
+template <typename OutputType, typename... TArgs>
 std::unique_ptr<OutputType> duplicate(std::shared_ptr<const Executor> exec,
                                       size_type num_duplications,
-                                      const OutputType* input)
+                                      const OutputType* input,
+                                      TArgs&&... create_args)
 {
     auto num_batch_items = input->get_num_batch_items();
-    auto tmp = OutputType::create(
-        exec, batch_dim<2>(num_batch_items * num_duplications,
-                           input->get_common_size()));
+    auto tmp =
+        OutputType::create(exec,
+                           batch_dim<2>(num_batch_items * num_duplications,
+                                        input->get_common_size()),
+                           std::forward<TArgs>(create_args)...);
 
     for (size_type i = 0; i < num_duplications; ++i) {
         for (size_type b = 0; b < num_batch_items; ++b) {
@@ -74,14 +76,15 @@ std::unique_ptr<OutputType> duplicate(std::shared_ptr<const Executor> exec,
 }
 
 
-template <typename OutputType>
+template <typename OutputType, typename... TArgs>
 std::unique_ptr<OutputType> create_from_item(
     std::shared_ptr<const Executor> exec, const size_type num_duplications,
-    const typename OutputType::unbatch_type* input)
+    const typename OutputType::unbatch_type* input, TArgs&&... create_args)
 {
     auto num_batch_items = num_duplications;
     auto tmp = OutputType::create(
-        exec, batch_dim<2>(num_batch_items, input->get_size()));
+        exec, batch_dim<2>(num_batch_items, input->get_size()),
+        std::forward<TArgs>(create_args)...);
 
     for (size_type b = 0; b < num_batch_items; ++b) {
         tmp->create_view_for_item(b)->copy_from(input);
@@ -91,14 +94,16 @@ std::unique_ptr<OutputType> create_from_item(
 }
 
 
-template <typename OutputType>
+template <typename OutputType, typename... TArgs>
 std::unique_ptr<OutputType> create_from_item(
     std::shared_ptr<const Executor> exec,
-    const std::vector<typename OutputType::unbatch_type*>& input)
+    const std::vector<typename OutputType::unbatch_type*>& input,
+    TArgs&&... create_args)
 {
     auto num_batch_items = input.size();
     auto tmp = OutputType::create(
-        exec, batch_dim<2>(num_batch_items, input[0]->get_size()));
+        exec, batch_dim<2>(num_batch_items, input[0]->get_size()),
+        std::forward<TArgs>(create_args)...);
 
     for (size_type b = 0; b < num_batch_items; ++b) {
         tmp->create_view_for_item(b)->copy_from(input[b]);
@@ -121,14 +126,17 @@ auto unbatch(const InputType* batch_object)
 }
 
 
-template <typename ValueType, typename IndexType, typename OutputType>
+template <typename ValueType, typename IndexType, typename OutputType,
+          typename... TArgs>
 std::unique_ptr<OutputType> read(
     std::shared_ptr<const Executor> exec,
-    const std::vector<gko::matrix_data<ValueType, IndexType>>& data)
+    const std::vector<gko::matrix_data<ValueType, IndexType>>& data,
+    TArgs&&... create_args)
 {
     auto num_batch_items = data.size();
     auto tmp =
-        OutputType::create(exec, batch_dim<2>(num_batch_items, data[0].size));
+        OutputType::create(exec, batch_dim<2>(num_batch_items, data[0].size),
+                           std::forward<TArgs>(create_args)...);
 
     for (size_type b = 0; b < num_batch_items; ++b) {
         tmp->create_view_for_item(b)->read(data[b]);
@@ -154,6 +162,241 @@ std::vector<gko::matrix_data<ValueType, IndexType>> write(
 }
 
 
+/**
+ * Creates and initializes a batch of single column-vectors.
+ *
+ * Every inner list is one batch item (one column); all lists must have equal
+ * length. Non-zero entries are stored in matrix_data and handed to read().
+ *
+ * @tparam Matrix  matrix type to initialize
+ *                 (its value_type and index_type select the matrix_data type
+ *                 used to stage the values)
+ * @tparam TArgs  argument types for Matrix::create method
+ *                (not including the implied Executor as the first argument)
+ *
+ * @param vals  values used to initialize the batch vector (must not be empty)
+ * @param exec  Executor associated to the vector
+ * @param create_args  additional arguments passed to Matrix::create, not
+ *                     including the Executor, which is passed as the first
+ *                     argument
+ *
+ * @ingroup MultiVector
+ * @ingroup mat_formats
+ */
+template <typename Matrix, typename... TArgs>
+std::unique_ptr<Matrix> initialize(
+    std::initializer_list<std::initializer_list<typename Matrix::value_type>>
+        vals,
+    std::shared_ptr<const Executor> exec, TArgs&&... create_args)
+{
+    using value_type = typename Matrix::value_type;
+    using index_type = typename Matrix::index_type;
+    using mat_data = gko::matrix_data<value_type, index_type>;
+    size_type num_batch_items = vals.size();
+    GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty");
+    auto vals_begin = begin(vals);
+    size_type common_num_rows = vals_begin ? vals_begin->size() : 0;
+    auto common_size = dim<2>(common_num_rows, 1);
+    for (auto& val : vals) {
+        GKO_ASSERT_EQ(common_num_rows, val.size());
+    }
+    auto b_size = batch_dim<2>(num_batch_items, common_size);
+    size_type batch = 0;
+    std::vector<mat_data> input_mat_data(num_batch_items, common_size);
+    for (const auto& b : vals) {
+        input_mat_data[batch].nonzeros.reserve(b.size());
+        size_type idx = 0;
+        for (const auto& elem : b) {
+            if (elem != zero<value_type>()) {
+                input_mat_data[batch].nonzeros.emplace_back(idx, 0, elem);
+            }
+            ++idx;
+        }
+        ++batch;
+    }
+    return read<value_type, index_type, Matrix>(
+        exec, input_mat_data, std::forward<TArgs>(create_args)...);
+}
+
+
+/**
+ * Creates and initializes a batch of multi-vectors.
+ *
+ * Every batch item is a list of rows and must match the first item's size.
+ * Non-zero entries are stored in matrix_data and handed to read().
+ *
+ * @tparam Matrix  matrix type to initialize
+ *                 (value_type and index_type select the staging matrix_data)
+ * @tparam TArgs  argument types for Matrix::create method
+ *                (not including the implied Executor as the first argument)
+ *
+ * @param vals  values used to initialize the vector (must not be empty)
+ * @param exec  Executor associated to the vector
+ * @param create_args  additional arguments passed to Matrix::create, not
+ *                     including the Executor, which is passed as the first
+ *                     argument
+ *
+ * @ingroup MultiVector
+ * @ingroup mat_formats
+ */
+template <typename Matrix, typename... TArgs>
+std::unique_ptr<Matrix> initialize(
+    std::initializer_list<std::initializer_list<
+        std::initializer_list<typename Matrix::value_type>>>
+        vals,
+    std::shared_ptr<const Executor> exec, TArgs&&... create_args)
+{
+    using value_type = typename Matrix::value_type;
+    using index_type = typename Matrix::index_type;
+    using mat_data = gko::matrix_data<value_type, index_type>;
+    size_type num_batch_items = vals.size();
+    GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty");
+    auto vals_begin = begin(vals);
+    size_type common_num_rows = vals_begin ? vals_begin->size() : 0;
+    size_type common_num_cols =
+        vals_begin->begin() ? vals_begin->begin()->size() : 0;
+    auto common_size = dim<2>(common_num_rows, common_num_cols);
+    for (const auto& b : vals) {
+        auto num_rows = b.size();
+        auto num_cols = begin(b)->size();
+        auto b_size = dim<2>(num_rows, num_cols);
+        GKO_ASSERT_EQUAL_DIMENSIONS(b_size, common_size);
+    }
+
+    auto b_size = batch_dim<2>(num_batch_items, common_size);
+    size_type batch = 0;
+    std::vector<mat_data> input_mat_data(num_batch_items, common_size);
+    for (const auto& b : vals) {
+        size_type ridx = 0;
+        for (const auto& row : b) {
+            size_type cidx = 0;
+            for (const auto& elem : row) {
+                if (elem != zero<value_type>()) {
+                    input_mat_data[batch].nonzeros.emplace_back(ridx, cidx,
+                                                                elem);
+                }
+                ++cidx;
+            }
+            ++ridx;
+        }
+        ++batch;
+    }
+    return read<value_type, index_type, Matrix>(
+        exec, input_mat_data, std::forward<TArgs>(create_args)...);
+}
+
+
+/**
+ * Creates and initializes a batch single column-vector by making copies of the
+ * single input column vector.
+ *
+ * The non-zero entries of vals are stored in one matrix_data object per batch
+ * item and the resulting list is handed to read().
+ *
+ * @tparam Matrix  matrix type to initialize
+ *                 (value_type and index_type select the staging matrix_data
+ *                  type)
+ * @tparam TArgs  argument types for Matrix::create method
+ *                (not including the implied Executor as the first argument)
+ *
+ * @param num_vectors  The number of times the input vector is to be duplicated
+ * @param vals  values copied into every batch item (must not be empty)
+ * @param exec  Executor associated to the vector
+ * @param create_args  additional arguments passed to Matrix::create, not
+ *                     including the Executor, which is passed as the first
+ *                     argument
+ *
+ * @ingroup MultiVector
+ * @ingroup mat_formats
+ */
+template <typename Matrix, typename... TArgs>
+std::unique_ptr<Matrix> initialize(
+    const size_type num_vectors,
+    std::initializer_list<typename Matrix::value_type> vals,
+    std::shared_ptr<const Executor> exec, TArgs&&... create_args)
+{
+    using value_type = typename Matrix::value_type;
+    using index_type = typename Matrix::index_type;
+    using mat_data = gko::matrix_data<value_type, index_type>;
+    size_type num_batch_items = num_vectors;
+    GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0,
+                         "Input data is empty");
+    auto num_rows = begin(vals) ? vals.size() : 0;
+    auto common_size = dim<2>(num_rows, 1);
+    auto b_size = batch_dim<2>(num_batch_items, common_size);
+    std::vector<mat_data> input_mat_data(num_batch_items, common_size);
+    for (size_type batch = 0; batch < num_vectors; batch++) {
+        input_mat_data[batch].nonzeros.reserve(num_rows);
+        size_type idx = 0;
+        for (const auto& elem : vals) {
+            if (elem != zero<value_type>()) {
+                input_mat_data[batch].nonzeros.emplace_back(idx, 0, elem);
+            }
+            ++idx;
+        }
+    }
+    return read<value_type, index_type, Matrix>(
+        exec, input_mat_data, std::forward<TArgs>(create_args)...);
+}
+
+
+/**
+ * Creates and initializes a matrix from copies of a given matrix.
+ *
+ * The non-zero entries of vals are stored in one matrix_data object per batch
+ * item and the resulting list is handed to read().
+ *
+ * @tparam Matrix  matrix type to initialize
+ *                 (value_type and index_type select the staging matrix_data
+ *                  type)
+ * @tparam TArgs  argument types for Matrix::create method
+ *                (not including the implied Executor as the first argument)
+ *
+ * @param num_batch_items The number of times the input matrix is duplicated
+ * @param vals  rows of the matrix copied into every batch item
+ * @param exec  Executor associated to the matrix
+ * @param create_args  additional arguments passed to Matrix::create, not
+ *                     including the Executor, which is passed as the first
+ *                     argument
+ *
+ * @ingroup LinOp
+ * @ingroup mat_formats
+ */
+template <typename Matrix, typename... TArgs>
+std::unique_ptr<Matrix> initialize(
+    const size_type num_batch_items,
+    std::initializer_list<std::initializer_list<typename Matrix::value_type>>
+        vals,
+    std::shared_ptr<const Executor> exec, TArgs&&... create_args)
+{
+    using value_type = typename Matrix::value_type;
+    using index_type = typename Matrix::index_type;
+    using mat_data = gko::matrix_data<value_type, index_type>;
+    GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0,
+                         "Input data is empty");
+    auto common_size = dim<2>(begin(vals) ? vals.size() : 0,
+                              begin(vals) ? begin(vals)->size() : 0);
+    batch_dim<2> b_size(num_batch_items, common_size);
+    std::vector<mat_data> input_mat_data(num_batch_items, common_size);
+    for (size_type batch = 0; batch < num_batch_items; batch++) {
+        size_type ridx = 0;
+        for (const auto& row : vals) {
+            size_type cidx = 0;
+            for (const auto& elem : row) {
+                if (elem != zero<value_type>()) {
+                    input_mat_data[batch].nonzeros.emplace_back(ridx, cidx,
+                                                                elem);
+                }
+                ++cidx;
+            }
+            ++ridx;
+        }
+    }
+    return read<value_type, index_type, Matrix>(
+        exec, input_mat_data, std::forward<TArgs>(create_args)...);
+}
+
+
 }  // namespace batch
 }  // namespace gko
 
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
index 3aea6e1aae4..0d903b10968 100644
--- a/core/matrix/batch_ell.cpp
+++ b/core/matrix/batch_ell.cpp
@@ -70,13 +70,13 @@ Ell<ValueType, IndexType>::create_view_for_item(size_type item_id)
 {
     auto exec = this->get_executor();
     auto num_rows = this->get_common_size()[0];
-    auto stride = this->get_common_size()[1];
+    auto stride = this->get_common_size()[0];
     auto mat = unbatch_type::create(
         exec, this->get_common_size(),
         make_array_view(exec, this->get_num_elements_per_item(),
                         this->get_values_for_item(item_id)),
         make_array_view(exec, this->get_num_elements_per_item(),
-                        this->get_col_idxs_for_item(item_id)),
+                        this->get_col_idxs()),
         this->get_num_stored_elements_per_row(), stride);
     return mat;
 }
@@ -88,13 +88,13 @@ Ell<ValueType, IndexType>::create_const_view_for_item(size_type item_id) const
 {
     auto exec = this->get_executor();
     auto num_rows = this->get_common_size()[0];
-    auto stride = this->get_common_size()[1];
+    auto stride = this->get_common_size()[0];
     auto mat = unbatch_type::create_const(
         exec, this->get_common_size(),
         make_const_array_view(exec, this->get_num_elements_per_item(),
                               this->get_const_values_for_item(item_id)),
         make_const_array_view(exec, this->get_num_elements_per_item(),
-                              this->get_const_col_idxs_for_item(item_id)),
+                              this->get_const_col_idxs()),
         this->get_num_stored_elements_per_row(), stride);
     return mat;
 }
@@ -152,9 +152,10 @@ Ell<ValueType, IndexType>::Ell(std::shared_ptr<const Executor> exec,
                                const batch_dim<2>& size,
                                IndexType num_elems_per_row)
     : EnableBatchLinOp<Ell<ValueType, IndexType>>(exec, size),
-      num_elems_per_row_(num_elems_per_row),
-      values_(exec, compute_num_elems(size, num_elems_per_row)),
-      col_idxs_(exec, compute_num_elems(size, num_elems_per_row))
+      num_elems_per_row_(num_elems_per_row == 0 ? size.get_common_size()[1]
+                                                : num_elems_per_row),
+      values_(exec, compute_num_elems(size, num_elems_per_row_)),
+      col_idxs_(exec, this->get_common_size()[0] * num_elems_per_row_)
 {}
 
 
diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp
index 931efb47d2e..2830705bf5f 100644
--- a/core/test/matrix/batch_ell.cpp
+++ b/core/test/matrix/batch_ell.cpp
@@ -51,6 +51,7 @@ template <typename T>
 class Ell : public ::testing::Test {
 protected:
     using value_type = T;
+    using index_type = gko::int32;
     using EllMtx = gko::matrix::Ell<value_type>;
     using size_type = gko::size_type;
     Ell()
@@ -58,46 +59,71 @@ class Ell : public ::testing::Test {
           mtx(gko::batch::initialize<gko::batch::matrix::Ell<value_type>>(
               {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
                {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
-              exec)),
-          mvec(gko::batch::initialize<gko::batch::MultiVector<value_type>>(
-              {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
-               {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
-              exec)),
+              exec, 3)),
+          sp_mtx(gko::batch::initialize<gko::batch::matrix::Ell<value_type>>(
+              {{{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}},
+               {{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}},
+              exec, 2)),
           ell_mtx(gko::initialize<gko::matrix::Ell<value_type>>(
-              {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec))
+              {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec, gko::dim<2>(2, 3), 3)),
+          sp_ell_mtx(gko::initialize<gko::matrix::Ell<value_type>>(
+              {{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, exec, gko::dim<2>(2, 3), 2))
     {}
 
+    static void assert_equal_to_original_sparse_mtx(
+        const gko::batch::matrix::Ell<value_type>* m)
+    {
+        ASSERT_EQ(m->get_num_batch_items(), 2);
+        ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3));
+        ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 2));
+        ASSERT_EQ(m->get_num_stored_elements_per_row(), 2);
+        EXPECT_EQ(m->get_const_values()[0], value_type{-1.0});
+        EXPECT_EQ(m->get_const_values()[1], value_type{2.5});
+        EXPECT_EQ(m->get_const_values()[2], value_type{0.0});  // row 0 padding
+        EXPECT_EQ(m->get_const_values()[3], value_type{3.5});
+        EXPECT_EQ(m->get_const_values()[4], value_type{1.0});
+        EXPECT_EQ(m->get_const_values()[5], value_type{2.0});
+        EXPECT_EQ(m->get_const_values()[6], value_type{0.0});  // row 0 padding
+        EXPECT_EQ(m->get_const_values()[7], value_type{3.0});
+        EXPECT_EQ(m->get_const_col_idxs()[0], index_type{0});
+        EXPECT_EQ(m->get_const_col_idxs()[1], index_type{1});
+        EXPECT_EQ(m->get_const_col_idxs()[2], index_type{-1});  // row 0 padding
+        ASSERT_EQ(m->get_const_col_idxs()[3], index_type{2});
+    }
 
     static void assert_equal_to_original_mtx(
-        gko::batch::matrix::Ell<value_type>* m)
+        const gko::batch::matrix::Ell<value_type>* m)
     {
         ASSERT_EQ(m->get_num_batch_items(), 2);
         ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3));
         ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 3));
-        EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0});
-        EXPECT_EQ(m->at(0, 0, 1), value_type{2.0});
-        EXPECT_EQ(m->at(0, 0, 2), value_type{3.0});
-        EXPECT_EQ(m->at(0, 1, 0), value_type{-1.5});
-        EXPECT_EQ(m->at(0, 1, 1), value_type{2.5});
-        ASSERT_EQ(m->at(0, 1, 2), value_type{3.5});
-        EXPECT_EQ(m->at(1, 0, 0), value_type{1.0});
-        EXPECT_EQ(m->at(1, 0, 1), value_type{2.5});
-        EXPECT_EQ(m->at(1, 0, 2), value_type{3.0});
-        EXPECT_EQ(m->at(1, 1, 0), value_type{1.0});
-        EXPECT_EQ(m->at(1, 1, 1), value_type{2.0});
-        ASSERT_EQ(m->at(1, 1, 2), value_type{3.0});
+        ASSERT_EQ(m->get_num_stored_elements_per_row(), 3);
+        EXPECT_EQ(m->get_const_values()[0], value_type{-1.0});
+        EXPECT_EQ(m->get_const_values()[1], value_type{-1.5});
+        EXPECT_EQ(m->get_const_values()[2], value_type{2.0});
+        EXPECT_EQ(m->get_const_values()[3], value_type{2.5});
+        EXPECT_EQ(m->get_const_values()[4], value_type{3.0});
+        EXPECT_EQ(m->get_const_values()[5], value_type{3.5});
+        EXPECT_EQ(m->get_const_values()[6], value_type{1.0});
+        EXPECT_EQ(m->get_const_values()[7], value_type{1.0});
+        EXPECT_EQ(m->get_const_values()[8], value_type{2.5});
+        EXPECT_EQ(m->get_const_values()[9], value_type{2.0});
+        EXPECT_EQ(m->get_const_values()[10], value_type{3.0});
+        ASSERT_EQ(m->get_const_values()[11], value_type{3.0});
     }
 
     static void assert_empty(gko::batch::matrix::Ell<value_type>* m)
     {
         ASSERT_EQ(m->get_num_batch_items(), 0);
         ASSERT_EQ(m->get_num_stored_elements(), 0);
+        ASSERT_EQ(m->get_num_stored_elements_per_row(), 0);
     }
 
     std::shared_ptr<const gko::Executor> exec;
     std::unique_ptr<gko::batch::matrix::Ell<value_type>> mtx;
-    std::unique_ptr<gko::batch::MultiVector<value_type>> mvec;
+    std::unique_ptr<gko::batch::matrix::Ell<value_type>> sp_mtx;
     std::unique_ptr<gko::matrix::Ell<value_type>> ell_mtx;
+    std::unique_ptr<gko::matrix::Ell<value_type>> sp_ell_mtx;
 };
 
 TYPED_TEST_SUITE(Ell, gko::test::ValueTypes);
@@ -109,6 +135,12 @@ TYPED_TEST(Ell, KnowsItsSizeAndValues)
 }
 
 
+TYPED_TEST(Ell, SparseMtxKnowsItsSizeAndValues)
+{
+    this->assert_equal_to_original_sparse_mtx(this->sp_mtx.get());
+}
+
+
 TYPED_TEST(Ell, CanBeEmpty)
 {
     auto empty = gko::batch::matrix::Ell<TypeParam>::create(this->exec);
@@ -137,10 +169,10 @@ TYPED_TEST(Ell, CanCreateEllItemView)
 }
 
 
-TYPED_TEST(Ell, CanCreateMultiVectorView)
+TYPED_TEST(Ell, CanCreateSpEllItemView)
 {
-    GKO_ASSERT_BATCH_MTX_NEAR(this->mtx->create_multi_vector_view(), this->mvec,
-                              0.0);
+    GKO_ASSERT_MTX_NEAR(this->sp_mtx->create_view_for_item(1), this->sp_ell_mtx,
+                        0.0);
 }
 
 
@@ -151,8 +183,7 @@ TYPED_TEST(Ell, CanBeCopied)
     mtx_copy->copy_from(this->mtx.get());
 
     this->assert_equal_to_original_mtx(this->mtx.get());
-    this->mtx->at(0, 0, 0) = 7;
-    this->mtx->at(0, 1) = 7;
+    this->mtx->get_values()[0] = 7;
     this->assert_equal_to_original_mtx(mtx_copy.get());
 }
 
@@ -189,71 +220,62 @@ TYPED_TEST(Ell, CanBeConstructedWithSize)
     using size_type = gko::size_type;
 
     auto m = gko::batch::matrix::Ell<TypeParam>::create(
-        this->exec, gko::batch_dim<2>(2, gko::dim<2>{5, 3}));
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>{5, 3}), 2);
 
     ASSERT_EQ(m->get_num_batch_items(), 2);
     ASSERT_EQ(m->get_common_size(), gko::dim<2>(5, 3));
-    ASSERT_EQ(m->get_num_stored_elements(), 30);
+    ASSERT_EQ(m->get_num_stored_elements_per_row(), 2);
+    ASSERT_EQ(m->get_num_stored_elements(), 20);
 }
 
 
 TYPED_TEST(Ell, CanBeConstructedFromExistingData)
 {
     using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     using size_type = gko::size_type;
     // clang-format off
-    value_type data[] = {
+    value_type values[] = {
+       -1.0,  2.5,
+       0.0,  3.5,
        1.0,  2.0,
-      -1.0,  3.0,
-       4.0, -1.0,
-       3.0,  5.0,
-       1.0,  5.0,
-       6.0, -3.0};
+       0.0,  3.0};
+    index_type col_idxs[] = {
+       0,  1,
+      -1, 2};
     // clang-format on
 
     auto m = gko::batch::matrix::Ell<TypeParam>::create(
-        this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)),
-        gko::array<value_type>::view(this->exec, 8, data));
-
-    ASSERT_EQ(m->get_const_values(), data);
-    ASSERT_EQ(m->at(0, 0, 0), value_type{1.0});
-    ASSERT_EQ(m->at(0, 0, 1), value_type{2.0});
-    ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0});
-    ASSERT_EQ(m->at(0, 1, 1), value_type{3.0});
-    ASSERT_EQ(m->at(1, 0, 0), value_type{4.0});
-    ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0});
-    ASSERT_EQ(m->at(1, 1, 0), value_type{3.0});
-    ASSERT_EQ(m->at(1, 1, 1), value_type{5.0});
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 3)), 2,
+        gko::array<value_type>::view(this->exec, 8, values),
+        gko::array<index_type>::view(this->exec, 4, col_idxs));
+
+    this->assert_equal_to_original_sparse_mtx(m.get());
 }
 
 
 TYPED_TEST(Ell, CanBeConstructedFromExistingConstData)
 {
     using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     using size_type = gko::size_type;
     // clang-format off
-    const value_type data[] = {
+    value_type values[] = {
+       -1.0,  2.5,
+       0.0,  3.5,
        1.0,  2.0,
-      -1.0,  3.0,
-       4.0, -1.0,
-       3.0,  5.0,
-       1.0,  5.0,
-       6.0, -3.0};
+       0.0,  3.0};
+    index_type col_idxs[] = {
+       0,  1,
+      -1, 2};
     // clang-format on
 
     auto m = gko::batch::matrix::Ell<TypeParam>::create_const(
-        this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)),
-        gko::array<value_type>::const_view(this->exec, 8, data));
-
-    ASSERT_EQ(m->get_const_values(), data);
-    ASSERT_EQ(m->at(0, 0, 0), value_type{1.0});
-    ASSERT_EQ(m->at(0, 0, 1), value_type{2.0});
-    ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0});
-    ASSERT_EQ(m->at(0, 1, 1), value_type{3.0});
-    ASSERT_EQ(m->at(1, 0, 0), value_type{4.0});
-    ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0});
-    ASSERT_EQ(m->at(1, 1, 0), value_type{3.0});
-    ASSERT_EQ(m->at(1, 1, 1), value_type{5.0});
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 3)), 2,
+        gko::array<value_type>::const_view(this->exec, 8, values),
+        gko::array<index_type>::const_view(this->exec, 4, col_idxs));
+
+    this->assert_equal_to_original_sparse_mtx(m.get());
 }
 
 
@@ -263,35 +285,36 @@ TYPED_TEST(Ell, CanBeConstructedFromEllMatrices)
     using EllMtx = typename TestFixture::EllMtx;
     using size_type = gko::size_type;
 
-    auto mat1 = gko::initialize<EllMtx>({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+    auto mat1 = gko::initialize<EllMtx>({{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}},
                                         this->exec);
     auto mat2 =
-        gko::initialize<EllMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec);
+        gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, this->exec);
 
     auto m = gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
-        this->exec, std::vector<EllMtx*>{mat1.get(), mat2.get()});
+        this->exec, std::vector<EllMtx*>{mat1.get(), mat2.get()},
+        mat1->get_num_stored_elements_per_row());
 
-    this->assert_equal_to_original_mtx(m.get());
+    this->assert_equal_to_original_sparse_mtx(m.get());
 }
 
 
 TYPED_TEST(Ell, CanBeConstructedFromEllMatricesByDuplication)
 {
     using value_type = typename TestFixture::value_type;
+    using index_type = int;
     using EllMtx = typename TestFixture::EllMtx;
     using size_type = gko::size_type;
 
-    auto mat1 = gko::initialize<EllMtx>(4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
-                                        this->exec);
-    auto mat2 =
-        gko::initialize<EllMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec);
+    auto mat1 =
+        gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 0.0}}, this->exec);
 
     auto bat_m =
         gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
             this->exec,
-            std::vector<EllMtx*>{mat1.get(), mat1.get(), mat1.get()});
+            std::vector<EllMtx*>{mat1.get(), mat1.get(), mat1.get()},
+            mat1->get_num_stored_elements_per_row());
     auto m = gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
-        this->exec, 3, mat1.get());
+        this->exec, 3, mat1.get(), mat1->get_num_stored_elements_per_row());
 
     GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14);
 }
@@ -300,24 +323,27 @@ TYPED_TEST(Ell, CanBeConstructedFromEllMatricesByDuplication)
 TYPED_TEST(Ell, CanBeConstructedByDuplicatingEllMatrices)
 {
     using value_type = typename TestFixture::value_type;
+    using index_type = int;
     using EllMtx = typename TestFixture::EllMtx;
     using size_type = gko::size_type;
 
-    auto mat1 = gko::initialize<EllMtx>({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+    auto mat1 = gko::initialize<EllMtx>({{-1.0, 0.0, 0.0}, {0.0, 2.5, 0.0}},
                                         this->exec);
     auto mat2 =
-        gko::initialize<EllMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec);
+        gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 0.0}}, this->exec);
 
     auto m = gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
-        this->exec, std::vector<EllMtx*>{mat1.get(), mat2.get()});
+        this->exec, std::vector<EllMtx*>{mat1.get(), mat2.get()},
+        mat1->get_num_stored_elements_per_row());
     auto m_ref =
         gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
             this->exec,
             std::vector<EllMtx*>{mat1.get(), mat2.get(), mat1.get(), mat2.get(),
-                                 mat1.get(), mat2.get()});
+                                 mat1.get(), mat2.get()},
+            mat1->get_num_stored_elements_per_row());
 
     auto m2 = gko::batch::duplicate<gko::batch::matrix::Ell<value_type>>(
-        this->exec, 3, m.get());
+        this->exec, 3, m.get(), mat1->get_num_stored_elements_per_row());
 
     GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14);
 }
@@ -326,15 +352,16 @@ TYPED_TEST(Ell, CanBeConstructedByDuplicatingEllMatrices)
 TYPED_TEST(Ell, CanBeUnbatchedIntoEllMatrices)
 {
     using value_type = typename TestFixture::value_type;
+    using index_type = int;
     using EllMtx = typename TestFixture::EllMtx;
     using size_type = gko::size_type;
-    auto mat1 = gko::initialize<EllMtx>(4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+    auto mat1 = gko::initialize<EllMtx>({{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}},
                                         this->exec);
     auto mat2 =
-        gko::initialize<EllMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, this->exec);
+        gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, this->exec);
 
     auto ell_mats = gko::batch::unbatch<gko::batch::matrix::Ell<value_type>>(
-        this->mtx.get());
+        this->sp_mtx.get());
 
     GKO_ASSERT_MTX_NEAR(ell_mats[0].get(), mat1.get(), 0.);
     GKO_ASSERT_MTX_NEAR(ell_mats[1].get(), mat2.get(), 0.);
@@ -344,55 +371,83 @@ TYPED_TEST(Ell, CanBeUnbatchedIntoEllMatrices)
 TYPED_TEST(Ell, CanBeListConstructed)
 {
     using value_type = typename TestFixture::value_type;
+    using index_type = int;
     auto m = gko::batch::initialize<gko::batch::matrix::Ell<TypeParam>>(
-        {{1.0, 2.0}, {1.0, 3.0}}, this->exec);
+        {{0.0, -1.0}, {1.0, 0.0}}, this->exec);
 
     ASSERT_EQ(m->get_num_batch_items(), 2);
     ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1));
-    EXPECT_EQ(m->at(0, 0), value_type{1});
-    EXPECT_EQ(m->at(0, 1), value_type{2});
-    EXPECT_EQ(m->at(1, 0), value_type{1});
-    EXPECT_EQ(m->at(1, 1), value_type{3});
+    ASSERT_EQ(m->get_num_stored_elements(), 4);
+    ASSERT_EQ(m->get_num_stored_elements_per_row(), 1);
+    EXPECT_EQ(m->get_values()[0], value_type{0.0});
+    EXPECT_EQ(m->get_values()[1], value_type{-1.0});
+    EXPECT_EQ(m->get_values()[2], value_type{1.0});
+    EXPECT_EQ(m->get_values()[3], value_type{0.0});
+    EXPECT_EQ(m->get_col_idxs()[0], index_type{0});
+    EXPECT_EQ(m->get_col_idxs()[1], index_type{-1});
 }
 
 
 TYPED_TEST(Ell, CanBeListConstructedByCopies)
 {
     using value_type = typename TestFixture::value_type;
+    using index_type = int;
 
     auto m = gko::batch::initialize<gko::batch::matrix::Ell<TypeParam>>(
-        2, I<value_type>({1.0, 2.0}), this->exec);
+        2, I<value_type>({0.0, -1.0}), this->exec, 1);
 
     ASSERT_EQ(m->get_num_batch_items(), 2);
     ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1));
-    EXPECT_EQ(m->at(0, 0, 0), value_type{1.0});
-    EXPECT_EQ(m->at(0, 0, 1), value_type{2.0});
-    EXPECT_EQ(m->at(1, 0, 0), value_type{1.0});
-    EXPECT_EQ(m->at(1, 0, 1), value_type{2.0});
+    ASSERT_EQ(m->get_num_stored_elements(), 4);
+    ASSERT_EQ(m->get_num_stored_elements_per_row(), 1);
+    EXPECT_EQ(m->get_values()[0], value_type{0.0});
+    EXPECT_EQ(m->get_values()[1], value_type{-1.0});
+    EXPECT_EQ(m->get_values()[2], value_type{0.0});
+    EXPECT_EQ(m->get_values()[3], value_type{-1.0});
+    EXPECT_EQ(m->get_col_idxs()[0], index_type{-1});
+    EXPECT_EQ(m->get_col_idxs()[1], index_type{0});
 }
 
 
 TYPED_TEST(Ell, CanBeDoubleListConstructed)
 {
     using value_type = typename TestFixture::value_type;
+    using index_type = int;
     using T = value_type;
 
     auto m = gko::batch::initialize<gko::batch::matrix::Ell<TypeParam>>(
-        {{I<T>{1.0, 1.0, 0.0}, I<T>{2.0, 4.0, 3.0}, I<T>{3.0, 6.0, 1.0}},
-         {I<T>{1.0, 2.0, -1.0}, I<T>{3.0, 4.0, -2.0}, I<T>{5.0, 6.0, -3.0}}},
-        this->exec);
+        // clang-format off
+        {{I<T>{1.0, 0.0, 0.0},
+          I<T>{2.0, 0.0, 3.0},
+          I<T>{3.0, 6.0, 0.0}},
+         {I<T>{1.0, 0.0, 0.0},
+          I<T>{3.0, 0.0, -2.0},
+          I<T>{5.0, 8.0, 0.0}}},
+        // clang-format on
+        this->exec, 2);
 
+    ASSERT_EQ(m->get_num_batch_items(), 2);
     ASSERT_EQ(m->get_common_size(), gko::dim<2>(3, 3));
-    EXPECT_EQ(m->at(0, 0), value_type{1.0});
-    EXPECT_EQ(m->at(0, 1), value_type{1.0});
-    EXPECT_EQ(m->at(0, 2), value_type{0.0});
-    ASSERT_EQ(m->at(0, 3), value_type{2.0});
-    EXPECT_EQ(m->at(0, 4), value_type{4.0});
-    EXPECT_EQ(m->at(1, 0), value_type{1.0});
-    EXPECT_EQ(m->at(1, 1), value_type{2.0});
-    EXPECT_EQ(m->at(1, 2), value_type{-1.0});
-    ASSERT_EQ(m->at(1, 3), value_type{3.0});
-    EXPECT_EQ(m->at(1, 4), value_type{4.0});
+    ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 3));
+    ASSERT_EQ(m->get_num_stored_elements_per_row(), 2);
+    EXPECT_EQ(m->get_values()[0], value_type{1.0});
+    EXPECT_EQ(m->get_values()[1], value_type{2.0});
+    EXPECT_EQ(m->get_values()[2], value_type{3.0});
+    EXPECT_EQ(m->get_values()[3], value_type{0.0});
+    EXPECT_EQ(m->get_values()[4], value_type{3.0});
+    EXPECT_EQ(m->get_values()[5], value_type{6.0});
+    EXPECT_EQ(m->get_values()[6], value_type{1.0});
+    EXPECT_EQ(m->get_values()[7], value_type{3.0});
+    EXPECT_EQ(m->get_values()[8], value_type{5.0});
+    EXPECT_EQ(m->get_values()[9], value_type{0.0});
+    EXPECT_EQ(m->get_values()[10], value_type{-2.0});
+    EXPECT_EQ(m->get_values()[11], value_type{8.0});
+    EXPECT_EQ(m->get_col_idxs()[0], index_type{0});
+    EXPECT_EQ(m->get_col_idxs()[1], index_type{0});
+    EXPECT_EQ(m->get_col_idxs()[2], index_type{0});
+    EXPECT_EQ(m->get_col_idxs()[3], index_type{-1});
+    EXPECT_EQ(m->get_col_idxs()[4], index_type{2});
+    EXPECT_EQ(m->get_col_idxs()[5], index_type{1});
 }
 
 
@@ -400,52 +455,17 @@ TYPED_TEST(Ell, CanBeReadFromMatrixData)
 {
     using value_type = typename TestFixture::value_type;
     using index_type = int;
-
     auto vec_data = std::vector<gko::matrix_data<value_type, index_type>>{};
     vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
-        {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 0, 0.0}, {1, 1, 5.0}}));
+        {2, 3}, {{0, 0, -1.0}, {1, 1, 2.5}, {1, 2, 3.5}}));
     vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
-        {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 0, 0.0}, {1, 1, 9.0}}));
+        {2, 3}, {{0, 0, 1.0}, {1, 1, 2.0}, {1, 2, 3.0}}));
 
     auto m = gko::batch::read<value_type, index_type,
                               gko::batch::matrix::Ell<value_type>>(this->exec,
-                                                                   vec_data);
-
-    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2));
-    EXPECT_EQ(m->at(0, 0, 0), value_type{1.0});
-    EXPECT_EQ(m->at(0, 0, 1), value_type{3.0});
-    EXPECT_EQ(m->at(0, 1, 0), value_type{0.0});
-    EXPECT_EQ(m->at(0, 1, 1), value_type{5.0});
-    EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0});
-    EXPECT_EQ(m->at(1, 0, 1), value_type{0.5});
-    EXPECT_EQ(m->at(1, 1, 0), value_type{0.0});
-    EXPECT_EQ(m->at(1, 1, 1), value_type{9.0});
-}
-
-
-TYPED_TEST(Ell, CanBeReadFromSparseMatrixData)
-{
-    using value_type = typename TestFixture::value_type;
-    using index_type = int;
-    auto vec_data = std::vector<gko::matrix_data<value_type, index_type>>{};
-    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
-        {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 1, 5.0}}));
-    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
-        {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 1, 9.0}}));
+                                                                   vec_data, 2);
 
-    auto m = gko::batch::read<value_type, index_type,
-                              gko::batch::matrix::Ell<value_type>>(this->exec,
-                                                                   vec_data);
-
-    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2));
-    EXPECT_EQ(m->at(0, 0, 0), value_type{1.0});
-    EXPECT_EQ(m->at(0, 0, 1), value_type{3.0});
-    EXPECT_EQ(m->at(0, 1, 0), value_type{0.0});
-    EXPECT_EQ(m->at(0, 1, 1), value_type{5.0});
-    EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0});
-    EXPECT_EQ(m->at(1, 0, 1), value_type{0.5});
-    EXPECT_EQ(m->at(1, 1, 0), value_type{0.0});
-    EXPECT_EQ(m->at(1, 1, 1), value_type{9.0});
+    this->assert_equal_to_original_sparse_mtx(m.get());
 }
 
 
@@ -455,24 +475,18 @@ TYPED_TEST(Ell, GeneratesCorrectMatrixData)
     using index_type = int;
     using tpl = typename gko::matrix_data<TypeParam>::nonzero_type;
 
-    auto data =
-        gko::batch::write<value_type, index_type,
-                          gko::batch::matrix::Ell<value_type>>(this->mtx.get());
+    auto data = gko::batch::write<value_type, index_type,
+                                  gko::batch::matrix::Ell<value_type>>(
+        this->sp_mtx.get());
 
     ASSERT_EQ(data[0].size, gko::dim<2>(2, 3));
-    ASSERT_EQ(data[0].nonzeros.size(), 6);
+    ASSERT_EQ(data[0].nonzeros.size(), 3);
     EXPECT_EQ(data[0].nonzeros[0], tpl(0, 0, value_type{-1.0}));
-    EXPECT_EQ(data[0].nonzeros[1], tpl(0, 1, value_type{2.0}));
-    EXPECT_EQ(data[0].nonzeros[2], tpl(0, 2, value_type{3.0}));
-    EXPECT_EQ(data[0].nonzeros[3], tpl(1, 0, value_type{-1.5}));
-    EXPECT_EQ(data[0].nonzeros[4], tpl(1, 1, value_type{2.5}));
-    EXPECT_EQ(data[0].nonzeros[5], tpl(1, 2, value_type{3.5}));
+    EXPECT_EQ(data[0].nonzeros[1], tpl(1, 1, value_type{2.5}));
+    EXPECT_EQ(data[0].nonzeros[2], tpl(1, 2, value_type{3.5}));
     ASSERT_EQ(data[1].size, gko::dim<2>(2, 3));
-    ASSERT_EQ(data[1].nonzeros.size(), 6);
+    ASSERT_EQ(data[1].nonzeros.size(), 3);
     EXPECT_EQ(data[1].nonzeros[0], tpl(0, 0, value_type{1.0}));
-    EXPECT_EQ(data[1].nonzeros[1], tpl(0, 1, value_type{2.5}));
-    EXPECT_EQ(data[1].nonzeros[2], tpl(0, 2, value_type{3.0}));
-    EXPECT_EQ(data[1].nonzeros[3], tpl(1, 0, value_type{1.0}));
-    EXPECT_EQ(data[1].nonzeros[4], tpl(1, 1, value_type{2.0}));
-    EXPECT_EQ(data[1].nonzeros[5], tpl(1, 2, value_type{3.0}));
+    EXPECT_EQ(data[1].nonzeros[1], tpl(1, 1, value_type{2.0}));
+    EXPECT_EQ(data[1].nonzeros[2], tpl(1, 2, value_type{3.0}));
 }
diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp
index 61dffba3193..45ba0686468 100644
--- a/include/ginkgo/core/base/batch_multi_vector.hpp
+++ b/include/ginkgo/core/base/batch_multi_vector.hpp
@@ -212,8 +212,8 @@ class MultiVector
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const value_type* get_const_values_for_item(
-        size_type batch_id) const noexcept
+    const value_type* get_const_values_for_item(size_type batch_id) const
+        noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return values_.get_const_data() + this->get_cumulative_offset(batch_id);
@@ -462,224 +462,6 @@ class MultiVector
 };
 
 
-/**
- * Creates and initializes a batch of single column-vectors.
- *
- * This function first creates a temporary MultiVector, fills it with
- * passed in values, and then converts the vector to the requested type.
- *
- * @tparam Matrix  matrix type to initialize
- *                 (MultiVector has to implement the ConvertibleTo<Matrix>
- *                 interface)
- * @tparam TArgs  argument types for Matrix::create method
- *                (not including the implied Executor as the first argument)
- *
- * @param vals  values used to initialize the batch vector
- * @param exec  Executor associated to the vector
- * @param create_args  additional arguments passed to Matrix::create, not
- *                     including the Executor, which is passed as the first
- *                     argument
- *
- * @ingroup MultiVector
- * @ingroup mat_formats
- */
-template <typename Matrix, typename... TArgs>
-std::unique_ptr<Matrix> initialize(
-    std::initializer_list<std::initializer_list<typename Matrix::value_type>>
-        vals,
-    std::shared_ptr<const Executor> exec, TArgs&&... create_args)
-{
-    using batch_multi_vector = MultiVector<typename Matrix::value_type>;
-    size_type num_batch_items = vals.size();
-    GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty");
-    auto vals_begin = begin(vals);
-    size_type common_num_rows = vals_begin ? vals_begin->size() : 0;
-    auto common_size = dim<2>(common_num_rows, 1);
-    for (auto& val : vals) {
-        GKO_ASSERT_EQ(common_num_rows, val.size());
-    }
-    auto b_size = batch_dim<2>(num_batch_items, common_size);
-    auto tmp = batch_multi_vector::create(exec->get_master(), b_size);
-    size_type batch = 0;
-    for (const auto& b : vals) {
-        size_type idx = 0;
-        for (const auto& elem : b) {
-            tmp->at(batch, idx) = elem;
-            ++idx;
-        }
-        ++batch;
-    }
-    auto mtx = Matrix::create(exec, std::forward<TArgs>(create_args)...);
-    tmp->move_to(mtx);
-    return mtx;
-}
-
-
-/**
- * Creates and initializes a batch of multi-vectors.
- *
- * This function first creates a temporary MultiVector, fills it with
- * passed in values, and then converts the vector to the requested type.
- *
- * @tparam Matrix  matrix type to initialize
- *                 (Dense has to implement the ConvertibleTo<Matrix> interface)
- * @tparam TArgs  argument types for Matrix::create method
- *                (not including the implied Executor as the first argument)
- *
- * @param vals  values used to initialize the vector
- * @param exec  Executor associated to the vector
- * @param create_args  additional arguments passed to Matrix::create, not
- *                     including the Executor, which is passed as the first
- *                     argument
- *
- * @ingroup MultiVector
- * @ingroup mat_formats
- */
-template <typename Matrix, typename... TArgs>
-std::unique_ptr<Matrix> initialize(
-    std::initializer_list<std::initializer_list<
-        std::initializer_list<typename Matrix::value_type>>>
-        vals,
-    std::shared_ptr<const Executor> exec, TArgs&&... create_args)
-{
-    using batch_multi_vector = MultiVector<typename Matrix::value_type>;
-    size_type num_batch_items = vals.size();
-    GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty");
-    auto vals_begin = begin(vals);
-    size_type common_num_rows = vals_begin ? vals_begin->size() : 0;
-    size_type common_num_cols =
-        vals_begin->begin() ? vals_begin->begin()->size() : 0;
-    auto common_size = dim<2>(common_num_rows, common_num_cols);
-    for (const auto& b : vals) {
-        auto num_rows = b.size();
-        auto num_cols = begin(b)->size();
-        auto b_size = dim<2>(num_rows, num_cols);
-        GKO_ASSERT_EQUAL_DIMENSIONS(b_size, common_size);
-    }
-
-    auto b_size = batch_dim<2>(num_batch_items, common_size);
-    auto tmp = batch_multi_vector::create(exec->get_master(), b_size);
-    size_type batch = 0;
-    for (const auto& b : vals) {
-        size_type ridx = 0;
-        for (const auto& row : b) {
-            size_type cidx = 0;
-            for (const auto& elem : row) {
-                tmp->at(batch, ridx, cidx) = elem;
-                ++cidx;
-            }
-            ++ridx;
-        }
-        ++batch;
-    }
-    auto mtx = Matrix::create(exec, std::forward<TArgs>(create_args)...);
-    tmp->move_to(mtx);
-    return mtx;
-}
-
-
-/**
- * Creates and initializes a batch single column-vector by making copies of the
- * single input column vector.
- *
- * This function first creates a temporary batch multi-vector, fills it with
- * passed in values, and then converts the vector to the requested type.
- *
- * @tparam Matrix  matrix type to initialize
- *                 (MultiVector has to implement the ConvertibleTo<Matrix>
- *                  interface)
- * @tparam TArgs  argument types for Matrix::create method
- *                (not including the implied Executor as the first argument)
- *
- * @param num_vectors  The number of times the input vector is to be duplicated
- * @param vals  values used to initialize each vector in the temp. batch
- * @param exec  Executor associated to the vector
- * @param create_args  additional arguments passed to Matrix::create, not
- *                     including the Executor, which is passed as the first
- *                     argument
- *
- * @ingroup MultiVector
- * @ingroup mat_formats
- */
-template <typename Matrix, typename... TArgs>
-std::unique_ptr<Matrix> initialize(
-    const size_type num_vectors,
-    std::initializer_list<typename Matrix::value_type> vals,
-    std::shared_ptr<const Executor> exec, TArgs&&... create_args)
-{
-    using batch_multi_vector = MultiVector<typename Matrix::value_type>;
-    size_type num_batch_items = num_vectors;
-    GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0,
-                         "Input data is empty");
-    auto b_size =
-        batch_dim<2>(num_batch_items, dim<2>(begin(vals) ? vals.size() : 0, 1));
-    auto tmp = batch_multi_vector::create(exec->get_master(), b_size);
-    for (size_type batch = 0; batch < num_vectors; batch++) {
-        size_type idx = 0;
-        for (const auto& elem : vals) {
-            tmp->at(batch, idx) = elem;
-            ++idx;
-        }
-    }
-    auto mtx = Matrix::create(exec, std::forward<TArgs>(create_args)...);
-    tmp->move_to(mtx);
-    return mtx;
-}
-
-
-/**
- * Creates and initializes a matrix from copies of a given matrix.
- *
- * This function first creates a temporary batch multi-vector, fills it with
- * passed in values, and then converts the vector to the requested type.
- *
- * @tparam Matrix  matrix type to initialize
- *                 (MultiVector has to implement the ConvertibleTo<Matrix>
- *                  interface)
- * @tparam TArgs  argument types for Matrix::create method
- *                (not including the implied Executor as the first argument)
- *
- * @param num_batch_items The number of times the input matrix is duplicated
- * @param vals  values used to initialize each vector in the temp. batch
- * @param exec  Executor associated to the vector
- * @param create_args  additional arguments passed to Matrix::create, not
- *                     including the Executor, which is passed as the first
- *                     argument
- *
- * @ingroup LinOp
- * @ingroup mat_formats
- */
-template <typename Matrix, typename... TArgs>
-std::unique_ptr<Matrix> initialize(
-    const size_type num_batch_items,
-    std::initializer_list<std::initializer_list<typename Matrix::value_type>>
-        vals,
-    std::shared_ptr<const Executor> exec, TArgs&&... create_args)
-{
-    using batch_multi_vector = MultiVector<typename Matrix::value_type>;
-    GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0,
-                         "Input data is empty");
-    auto common_size = dim<2>(begin(vals) ? vals.size() : 0,
-                              begin(vals) ? begin(vals)->size() : 0);
-    batch_dim<2> b_size(num_batch_items, common_size);
-    auto tmp = batch_multi_vector::create(exec->get_master(), b_size);
-    for (size_type batch = 0; batch < num_batch_items; batch++) {
-        size_type ridx = 0;
-        for (const auto& row : vals) {
-            size_type cidx = 0;
-            for (const auto& elem : row) {
-                tmp->at(batch, ridx, cidx) = elem;
-                ++cidx;
-            }
-            ++ridx;
-        }
-    }
-    auto mtx = Matrix::create(exec, std::forward<TArgs>(create_args)...);
-    tmp->move_to(mtx);
-    return mtx;
-}
-
-
 }  // namespace batch
 }  // namespace gko
 
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index af77fc1e390..490f7a7d4b0 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -198,8 +198,8 @@ class Ell final
     }
 
     /**
-     * Returns a pointer to the array of col_idxs of the matrix for a
-     * specific batch item.
+     * Returns a pointer to the array of col_idxs of the matrix. This is shared
+     * across all batch items.
      *
      * @param batch_id  the id of the batch item.
      *
@@ -208,8 +208,7 @@ class Ell final
     index_type* get_col_idxs_for_item(size_type batch_id) noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
-        return col_idxs_.get_data() +
-               batch_id * this->get_num_elements_per_item();
+        return col_idxs_.get_data();
     }
 
     /**
@@ -223,8 +222,7 @@ class Ell final
         noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
-        return col_idxs_.get_const_data() +
-               batch_id * this->get_num_elements_per_item();
+        return col_idxs_.get_const_data();
     }
 
     /**
@@ -312,7 +310,8 @@ class Ell final
     size_type compute_num_elems(const batch_dim<2>& size,
                                 IndexType num_elems_per_row)
     {
-        return size.get_common_size()[0] * num_elems_per_row;
+        return size.get_num_batch_items() * size.get_common_size()[0] *
+               num_elems_per_row;
     }
 
 
@@ -356,8 +355,9 @@ class Ell final
         // Ensure that the value and col_idxs arrays have the correct size
         auto num_elems = this->get_common_size()[0] * num_elems_per_row *
                          this->get_num_batch_items();
-        GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems() + 1);
-        GKO_ENSURE_IN_BOUNDS(num_elems, col_idxs_.get_num_elems() + 1);
+        GKO_ASSERT_EQ(num_elems, values_.get_num_elems());
+        GKO_ASSERT_EQ(this->get_num_elements_per_item(),
+                      col_idxs_.get_num_elems());
     }
 
     /**

From c5a14c0dae8a4961ebcb70fab82c52754af39548 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Sun, 8 Oct 2023 10:26:50 +0200
Subject: [PATCH 05/18] Add OMP, CUDA, HIP kernels and tests

Co-authored-by: Aditya Kashi <kashia@ornl.gov>
---
 .../matrix/batch_ell_kernel_launcher.hpp.inc  |  29 +-
 .../cuda_hip/matrix/batch_ell_kernels.hpp.inc | 155 +++++++++++
 core/matrix/batch_struct.hpp                  |   5 +-
 cuda/matrix/batch_ell_kernels.cu              |   2 +-
 cuda/matrix/batch_struct.hpp                  |  34 +++
 hip/matrix/batch_dense_kernels.hip.cpp        |   1 -
 hip/matrix/batch_ell_kernels.hip.cpp          |  27 +-
 hip/matrix/batch_struct.hip.hpp               |  34 +++
 reference/matrix/batch_ell_kernels.hpp.inc    |   6 +-
 reference/matrix/batch_struct.hpp             |   4 +-
 reference/test/matrix/CMakeLists.txt          |   1 +
 reference/test/matrix/batch_ell_kernels.cpp   | 248 ++++++++++++++++++
 test/matrix/CMakeLists.txt                    |   1 +
 test/matrix/batch_ell_kernels.cpp             | 128 +++++++++
 14 files changed, 650 insertions(+), 25 deletions(-)
 create mode 100644 common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
 create mode 100644 reference/test/matrix/batch_ell_kernels.cpp
 create mode 100644 test/matrix/batch_ell_kernels.cpp

diff --git a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc
index 263e911c31a..f8da432aa4d 100644
--- a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc
+++ b/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc
@@ -34,7 +34,18 @@ template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Ell<ValueType, IndexType>* mat,
                   const batch::MultiVector<ValueType>* b,
-                  batch::MultiVector<ValueType>* x) GKO_NOT_IMPLEMENTED;
+                  batch::MultiVector<ValueType>* x)
+{
+    const auto num_blocks = mat->get_num_batch_items();
+    const auto b_ub = get_batch_struct(b);
+    const auto x_ub = get_batch_struct(x);
+    const auto mat_ub = get_batch_struct(mat);
+    if (b->get_common_size()[1] > 1) {
+        GKO_NOT_IMPLEMENTED;
+    }
+    simple_apply_kernel<<<num_blocks, default_block_size, 0,
+                          exec->get_stream()>>>(mat_ub, b_ub, x_ub);
+}
 
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
@@ -47,7 +58,21 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
                     const batch::matrix::Ell<ValueType, IndexType>* mat,
                     const batch::MultiVector<ValueType>* b,
                     const batch::MultiVector<ValueType>* beta,
-                    batch::MultiVector<ValueType>* x) GKO_NOT_IMPLEMENTED;
+                    batch::MultiVector<ValueType>* x)
+{
+    const auto num_blocks = mat->get_num_batch_items();
+    const auto b_ub = get_batch_struct(b);
+    const auto x_ub = get_batch_struct(x);
+    const auto mat_ub = get_batch_struct(mat);
+    const auto alpha_ub = get_batch_struct(alpha);
+    const auto beta_ub = get_batch_struct(beta);
+    if (b->get_common_size()[1] > 1) {
+        GKO_NOT_IMPLEMENTED;
+    }
+    advanced_apply_kernel<<<num_blocks, default_block_size, 0,
+                            exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
+                                                  beta_ub, x_ub);
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
new file mode 100644
index 00000000000..e55e7a60471
--- /dev/null
+++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
@@ -0,0 +1,155 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+// Computes x = mat * b for one batch item in ELL format; the rows are
+// distributed over the threads of the block. As in the reference kernel,
+// every stored slot of a row is accumulated, whatever the order of its
+// column indices. NOTE(review): relies on padding slots holding a zero
+// value and an in-range column index -- confirm with the Ell format.
+template <typename ValueType>
+__device__ __forceinline__ void simple_apply(
+    const gko::batch::matrix::batch_ell::batch_item<const ValueType>& mat,
+    const ValueType* const __restrict__ b, ValueType* const __restrict__ x)
+{
+    const auto num_rows = mat.num_rows;
+    const auto num_stored_elements_per_row = mat.num_stored_elems_per_row;
+    const auto stride = mat.stride;
+    const auto val = mat.values;
+    const auto col = mat.col_idxs;
+    for (int tidx = threadIdx.x; tidx < num_rows; tidx += blockDim.x) {
+        auto temp = zero<ValueType>();
+        for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
+            const auto ind = tidx + idx * stride;  // slot idx of row tidx
+            temp += val[ind] * b[col[ind]];
+        }
+        x[tidx] = temp;
+    }
+}
+
+// Applies simple_apply to every batch item. Block blockIdx.x starts at the
+// batch item of the same index and advances by gridDim.x, so a grid smaller
+// than the batch still covers all items.
+template <typename ValueType>
+__global__ __launch_bounds__(
+    default_block_size,
+    sm_oversubscription) void simple_apply_kernel(
+    const gko::batch::matrix::batch_ell::uniform_batch<const ValueType> mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
+{
+    const size_type num_items = mat.num_batch_items;
+    const size_type step = gridDim.x;
+    size_type item = blockIdx.x;
+    while (item < num_items) {
+        // Per-item views of the matrix, the input and the output vector.
+        const auto mat_item =
+            gko::batch::matrix::extract_batch_item(mat, item);
+        const auto b_item = gko::batch::extract_batch_item(b, item);
+        const auto x_item = gko::batch::extract_batch_item(x, item);
+
+        // Only the raw value pointers of the vectors are handed over.
+        simple_apply(mat_item, b_item.values, x_item.values);
+
+        item += step;
+    }
+}
+
+
+// Computes x = alpha * mat * b + beta * x for one batch item in ELL format;
+// the rows are distributed over the threads of the block. Every stored slot
+// of a row is accumulated, whatever the order of its column indices.
+// NOTE(review): relies on padding slots holding a zero value and an
+// in-range column index -- confirm with the Ell format.
+template <typename ValueType>
+__device__ __forceinline__ void advanced_apply(
+    const ValueType alpha,
+    const gko::batch::matrix::batch_ell::batch_item<const ValueType>& mat,
+    const ValueType* const __restrict__ b, const ValueType beta,
+    ValueType* const __restrict__ x)
+{
+    const auto num_rows = mat.num_rows;
+    const auto num_stored_elements_per_row = mat.num_stored_elems_per_row;
+    const auto stride = mat.stride;
+    const auto val = mat.values;
+    const auto col = mat.col_idxs;
+    for (int tidx = threadIdx.x; tidx < num_rows; tidx += blockDim.x) {
+        auto temp = zero<ValueType>();
+        for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
+            const auto ind = tidx + idx * stride;  // slot idx of row tidx
+            temp += alpha * val[ind] * b[col[ind]];
+        }
+        x[tidx] = temp + beta * x[tidx];
+    }
+}
+
+// Applies advanced_apply, x = alpha * mat * b + beta * x, to every batch
+// item.
+//
+// Block blockIdx.x starts at the batch item of the same index and advances
+// by gridDim.x, so a grid smaller than the batch still covers all items.
+//
+// alpha and beta are batched as well: every batch item reads its own pair
+// of scalars.
+template <typename ValueType>
+__global__ __launch_bounds__(
+    default_block_size,
+    sm_oversubscription) void advanced_apply_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::matrix::batch_ell::uniform_batch<const ValueType> mat,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> beta,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x)
+{
+    const size_type num_items = mat.num_batch_items;
+    const size_type step = gridDim.x;
+    size_type item = blockIdx.x;
+    while (item < num_items) {
+        // Per-item views of the matrix, the input and the output vector.
+        const auto mat_item =
+            gko::batch::matrix::extract_batch_item(mat, item);
+        const auto b_item = gko::batch::extract_batch_item(b, item);
+        const auto x_item = gko::batch::extract_batch_item(x, item);
+
+        // Per-item scalars: only the first stored value of each is used.
+        const auto alpha_item = gko::batch::extract_batch_item(alpha, item);
+        const auto beta_item = gko::batch::extract_batch_item(beta, item);
+        const ValueType alpha_value = alpha_item.values[0];
+        const ValueType beta_value = beta_item.values[0];
+
+        // Only the raw value pointers of the vectors are handed over.
+        advanced_apply(alpha_value, mat_item, b_item.values, beta_value,
+                       x_item.values);
+
+        item += step;
+    }
+}
diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp
index 272bb506df2..2eed40882bc 100644
--- a/core/matrix/batch_struct.hpp
+++ b/core/matrix/batch_struct.hpp
@@ -188,8 +188,7 @@ GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item<ValueType> extract_batch_item(
 {
     return {batch.values +
                 batch_idx * batch.num_stored_elems_per_row * batch.num_rows,
-            batch.col_idxs +
-                batch_idx * batch.num_stored_elems_per_row * batch.num_rows,
+            batch.col_idxs,
             batch.stride,
             batch.num_rows,
             batch.num_cols,
@@ -203,7 +202,7 @@ GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item<ValueType> extract_batch_item(
     const size_type batch_idx)
 {
     return {batch_values + batch_idx * num_elems_per_row * num_rows,
-            batch_col_idxs + batch_idx * num_elems_per_row * num_rows,
+            batch_col_idxs,
             stride,
             num_rows,
             num_cols,
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
index 567d863d95c..ee6a99f04ca 100644
--- a/cuda/matrix/batch_ell_kernels.cu
+++ b/cuda/matrix/batch_ell_kernels.cu
@@ -72,7 +72,7 @@ constexpr int sm_oversubscription = 4;
 
 // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
 
-// #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
 
 
 #include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp
index 73712a7b81b..7a6a4ac7f00 100644
--- a/cuda/matrix/batch_struct.hpp
+++ b/cuda/matrix/batch_struct.hpp
@@ -87,6 +87,40 @@ get_batch_struct(batch::matrix::Dense<ValueType>* const op)
 }
 
 
+/**
+ * Builds a read-only uniform batch view of a batch of ell matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::batch_ell::uniform_batch<const cuda_type<ValueType>>
+get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
+{
+    const auto num_rows = static_cast<int>(op->get_common_size()[0]);
+    const auto num_cols = static_cast<int>(op->get_common_size()[1]);
+    const auto row_nnz =
+        static_cast<int>(op->get_num_stored_elements_per_row());
+    // The row count is passed twice, presumably as stride and as num_rows.
+    return {as_cuda_type(op->get_const_values()), op->get_const_col_idxs(),
+            op->get_num_batch_items(), num_rows, num_rows, num_cols, row_nnz};
+}
+
+
+/**
+ * Builds a mutable uniform batch view of a batch of ell matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::batch_ell::uniform_batch<cuda_type<ValueType>>
+get_batch_struct(batch::matrix::Ell<ValueType, int32>* const op)
+{
+    const auto num_rows = static_cast<int>(op->get_common_size()[0]);
+    const auto num_cols = static_cast<int>(op->get_common_size()[1]);
+    const auto row_nnz =
+        static_cast<int>(op->get_num_stored_elements_per_row());
+    // The row count is passed twice, presumably as stride and as num_rows.
+    return {as_cuda_type(op->get_values()), op->get_col_idxs(),
+            op->get_num_batch_items(), num_rows, num_rows, num_cols, row_nnz};
+}
+
+
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp
index eb3da83760a..3361feeb8b8 100644
--- a/hip/matrix/batch_dense_kernels.hip.cpp
+++ b/hip/matrix/batch_dense_kernels.hip.cpp
@@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
 
 
 #include "core/base/batch_struct.hpp"
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
index 567d863d95c..fdd52c38f57 100644
--- a/hip/matrix/batch_ell_kernels.hip.cpp
+++ b/hip/matrix/batch_ell_kernels.hip.cpp
@@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/batch_ell_kernels.hpp"
 
 
+#include <hip/hip_runtime.h>
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
@@ -42,21 +43,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-#include "cuda/matrix/batch_struct.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/hipblas_bindings.hip.hpp"
+#include "hip/base/pointer_mode_guard.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace hip {
 /**
  * @brief The Ell matrix format namespace.
  * @ref Ell
@@ -72,7 +73,7 @@ constexpr int sm_oversubscription = 4;
 
 // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
 
-// #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
 
 
 #include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
@@ -81,6 +82,6 @@ constexpr int sm_oversubscription = 4;
 
 
 }  // namespace batch_ell
-}  // namespace cuda
+}  // namespace hip
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp
index 4670cf0988b..a43d7d058b0 100644
--- a/hip/matrix/batch_struct.hip.hpp
+++ b/hip/matrix/batch_struct.hip.hpp
@@ -87,6 +87,40 @@ get_batch_struct(batch::matrix::Dense<ValueType>* const op)
 }
 
 
+/**
+ * Builds a read-only uniform batch view of a batch of ell matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::batch_ell::uniform_batch<const hip_type<ValueType>>
+get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
+{
+    const auto num_rows = static_cast<int>(op->get_common_size()[0]);
+    const auto num_cols = static_cast<int>(op->get_common_size()[1]);
+    const auto row_nnz =
+        static_cast<int>(op->get_num_stored_elements_per_row());
+    // The row count is passed twice, presumably as stride and as num_rows.
+    return {as_hip_type(op->get_const_values()), op->get_const_col_idxs(),
+            op->get_num_batch_items(), num_rows, num_rows, num_cols, row_nnz};
+}
+
+
+/**
+ * Builds a mutable uniform batch view of a batch of ell matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::batch_ell::uniform_batch<hip_type<ValueType>>
+get_batch_struct(batch::matrix::Ell<ValueType, int32>* const op)
+{
+    const auto num_rows = static_cast<int>(op->get_common_size()[0]);
+    const auto num_cols = static_cast<int>(op->get_common_size()[1]);
+    const auto row_nnz =
+        static_cast<int>(op->get_num_stored_elements_per_row());
+    // The row count is passed twice, presumably as stride and as num_rows.
+    return {as_hip_type(op->get_values()), op->get_col_idxs(),
+            op->get_num_batch_items(), num_rows, num_rows, num_cols, row_nnz};
+}
+
+
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp.inc
index 37370261d44..41d0a00ddcd 100644
--- a/reference/matrix/batch_ell_kernels.hpp.inc
+++ b/reference/matrix/batch_ell_kernels.hpp.inc
@@ -36,14 +36,14 @@ inline void simple_apply_kernel(
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const gko::batch::multi_vector::batch_item<ValueType>& c)
 {
-    for (int row = 0; row < a.num_rows; ++row) {
-        for (int j = 0; j < b.num_rhs; ++j) {
+    for (int row = 0; row < c.num_rows; ++row) {
+        for (int j = 0; j < c.num_rhs; ++j) {
             c.values[row * c.stride + j] = zero<ValueType>();
         }
         for (auto k = 0; k < a.num_stored_elems_per_row; ++k) {
             auto val = a.values[row + k * a.stride];
             auto col = a.col_idxs[row + k * a.stride];
-            for (int j = 0; j < b.num_rhs; ++j) {
+            for (int j = 0; j < c.num_rhs; ++j) {
                 c.values[row * c.stride + j] +=
                     val * b.values[col * b.stride + j];
             }
diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp
index b5eacd80d18..3b562450ee0 100644
--- a/reference/matrix/batch_struct.hpp
+++ b/reference/matrix/batch_struct.hpp
@@ -101,7 +101,7 @@ get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
     return {op->get_const_values(),
             op->get_const_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int>(op->get_common_size()[1]),
+            static_cast<int>(op->get_common_size()[0]),
             static_cast<int>(op->get_common_size()[0]),
             static_cast<int>(op->get_common_size()[1]),
             static_cast<int>(op->get_num_stored_elements_per_row())};
@@ -118,7 +118,7 @@ inline batch::matrix::batch_ell::uniform_batch<ValueType> get_batch_struct(
     return {op->get_values(),
             op->get_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int>(op->get_common_size()[1]),
+            static_cast<int>(op->get_common_size()[0]),
             static_cast<int>(op->get_common_size()[0]),
             static_cast<int>(op->get_common_size()[1]),
             static_cast<int>(op->get_num_stored_elements_per_row())};
diff --git a/reference/test/matrix/CMakeLists.txt b/reference/test/matrix/CMakeLists.txt
index 18634de662d..05498cbadc4 100644
--- a/reference/test/matrix/CMakeLists.txt
+++ b/reference/test/matrix/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_test(batch_dense_kernels)
+ginkgo_create_test(batch_ell_kernels)
 ginkgo_create_test(coo_kernels)
 ginkgo_create_test(csr_kernels)
 ginkgo_create_test(dense_kernels)
diff --git a/reference/test/matrix/batch_ell_kernels.cpp b/reference/test/matrix/batch_ell_kernels.cpp
new file mode 100644
index 00000000000..76b681c69f7
--- /dev/null
+++ b/reference/test/matrix/batch_ell_kernels.cpp
@@ -0,0 +1,248 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include <complex>
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+
+
+#include "core/matrix/batch_ell_kernels.hpp"
+#include "core/test/utils.hpp"
+
+// Fixture pairing each batched operand with per-item non-batched copies.
+template <typename T>
+class Ell : public ::testing::Test {
+protected:
+    using value_type = T;
+    using size_type = gko::size_type;
+    using Mtx = gko::batch::matrix::Ell<value_type>;  // batched ELL matrix
+    using MVec = gko::batch::MultiVector<value_type>;
+    using EllMtx = gko::matrix::Ell<value_type>;  // non-batched ELL matrix
+    using DenseMtx = gko::matrix::Dense<value_type>;
+    using ComplexMtx = gko::to_complex<Mtx>;
+    using RealMtx = gko::remove_complex<Mtx>;
+    Ell()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx_0(gko::batch::initialize<Mtx>(  // two 2x3 batch items
+              {{I<T>({1.0, -1.0, 1.5}), I<T>({-2.0, 2.0, 3.0})},
+               {{1.0, -2.0, -0.5}, {1.0, -2.5, 4.0}}},
+              exec)),
+          mtx_00(gko::initialize<EllMtx>(
+              {I<T>({1.0, -1.0, 1.5}), I<T>({-2.0, 2.0, 3.0})}, exec)),
+          mtx_01(gko::initialize<EllMtx>(
+              {I<T>({1.0, -2.0, -0.5}), I<T>({1.0, -2.5, 4.0})}, exec)),
+          b_0(gko::batch::initialize<MVec>(  // two 3x3 batch items
+              {{I<T>({1.0, 0.0, 1.0}), I<T>({2.0, 0.0, 1.0}),
+                I<T>({1.0, 0.0, 2.0})},
+               {I<T>({-1.0, 1.0, 1.0}), I<T>({1.0, -1.0, 1.0}),
+                I<T>({1.0, 0.0, 2.0})}},
+              exec)),
+          b_00(gko::initialize<DenseMtx>(
+              {I<T>({1.0, 0.0, 1.0}), I<T>({2.0, 0.0, 1.0}),
+               I<T>({1.0, 0.0, 2.0})},
+              exec)),
+          b_01(gko::initialize<DenseMtx>(
+              {I<T>({-1.0, 1.0, 1.0}), I<T>({1.0, -1.0, 1.0}),
+               I<T>({1.0, 0.0, 2.0})},
+              exec)),
+          x_0(gko::batch::initialize<MVec>(  // two 2x3 batch items
+              {{I<T>({2.0, 0.0, 1.0}), I<T>({2.0, 0.0, 2.0})},
+               {I<T>({-2.0, 1.0, 1.0}), I<T>({1.0, -1.0, -1.0})}},
+              exec)),
+          x_00(gko::initialize<DenseMtx>(
+              {I<T>({2.0, 0.0, 1.0}), I<T>({2.0, 0.0, 2.0})}, exec)),
+          x_01(gko::initialize<DenseMtx>(
+              {I<T>({-2.0, 1.0, 1.0}), I<T>({1.0, -1.0, -1.0})}, exec))
+    {}
+    // *_0 members are batched; *_00 and *_01 hold the values of items 0 and 1.
+    std::shared_ptr<const gko::ReferenceExecutor> exec;
+    std::unique_ptr<Mtx> mtx_0;
+    std::unique_ptr<EllMtx> mtx_00;
+    std::unique_ptr<EllMtx> mtx_01;
+    std::unique_ptr<MVec> b_0;
+    std::unique_ptr<DenseMtx> b_00;
+    std::unique_ptr<DenseMtx> b_01;
+    std::unique_ptr<MVec> x_0;
+    std::unique_ptr<DenseMtx> x_00;
+    std::unique_ptr<DenseMtx> x_01;
+    // Default-constructed engine; the constructor above does not use it.
+    std::ranlux48 rand_engine;
+};
+
+
+TYPED_TEST_SUITE(Ell, gko::test::ValueTypes);
+
+
+TYPED_TEST(Ell, AppliesToBatchMultiVector)
+{
+    using MVec = typename TestFixture::MVec;
+    // Expected results: apply every batch item as a non-batched matrix.
+    this->mtx_00->apply(this->b_00.get(), this->x_00.get());
+    this->mtx_01->apply(this->b_01.get(), this->x_01.get());
+    // Batched apply, then split the output into its batch items.
+    this->mtx_0->apply(this->b_0.get(), this->x_0.get());
+    auto items = gko::batch::unbatch<MVec>(this->x_0.get());
+
+    GKO_ASSERT_MTX_NEAR(items[0].get(), this->x_00.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(items[1].get(), this->x_01.get(), 0.);
+}
+
+
+TYPED_TEST(Ell, AppliesLinearCombinationWithSameAlphaToBatchMultiVector)
+{
+    using MVec = typename TestFixture::MVec;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    // Same scalars for all items (the leading 2 is presumably the item count).
+    auto alpha = gko::batch::initialize<MVec>(2, {1.5}, this->exec);
+    auto beta = gko::batch::initialize<MVec>(2, {-4.0}, this->exec);
+    // The same scalars, non-batched, for the per-item expected results.
+    auto alpha0 = gko::initialize<DenseMtx>({1.5}, this->exec);
+    auto alpha1 = gko::initialize<DenseMtx>({1.5}, this->exec);
+    auto beta0 = gko::initialize<DenseMtx>({-4.0}, this->exec);
+    auto beta1 = gko::initialize<DenseMtx>({-4.0}, this->exec);
+
+    this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(),
+                        this->x_00.get());
+    this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(),
+                        this->x_01.get());
+    this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(),
+                       this->x_0.get());
+
+    auto items = gko::batch::unbatch<MVec>(this->x_0.get());
+
+    GKO_ASSERT_MTX_NEAR(items[0].get(), this->x_00.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(items[1].get(), this->x_01.get(), 0.);
+}
+
+
+TYPED_TEST(Ell, AppliesLinearCombinationToBatchMultiVector)
+{
+    using MVec = typename TestFixture::MVec;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    // Every batch item gets its own alpha and beta.
+    auto alpha = gko::batch::initialize<MVec>({{1.5}, {-1.0}}, this->exec);
+    auto beta = gko::batch::initialize<MVec>({{2.5}, {-4.0}}, this->exec);
+    // The same scalars, non-batched, for the per-item expected results.
+    auto alpha0 = gko::initialize<DenseMtx>({1.5}, this->exec);
+    auto alpha1 = gko::initialize<DenseMtx>({-1.0}, this->exec);
+    auto beta0 = gko::initialize<DenseMtx>({2.5}, this->exec);
+    auto beta1 = gko::initialize<DenseMtx>({-4.0}, this->exec);
+
+    this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(),
+                        this->x_00.get());
+    this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(),
+                        this->x_01.get());
+    this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(),
+                       this->x_0.get());
+
+    auto items = gko::batch::unbatch<MVec>(this->x_0.get());
+
+    GKO_ASSERT_MTX_NEAR(items[0].get(), this->x_00.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(items[1].get(), this->x_01.get(), 0.);
+}
+
+
+TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultCols)
+{
+    using MVec = typename TestFixture::MVec;
+    const gko::batch_dim<2> bad_size{2, gko::dim<2>{2}};  // x_0 items: 2x3
+    auto bad_x = MVec::create(this->exec, bad_size);
+    ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), bad_x.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultRows)
+{
+    using MVec = typename TestFixture::MVec;
+    const gko::batch_dim<2> bad_size{2, gko::dim<2>{3}};  // x_0 items: 2x3
+    auto bad_x = MVec::create(this->exec, bad_size);
+    ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), bad_x.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Ell, ApplyFailsOnWrongInnerDimension)
+{
+    using MVec = typename TestFixture::MVec;
+    // The input has 2 rows per item, but the matrix items have 3 columns.
+    const gko::batch_dim<2> bad_size{2, gko::dim<2>{2, 3}};
+    auto bad_b = MVec::create(this->exec, bad_size);
+    ASSERT_THROW(this->mtx_0->apply(bad_b.get(), this->x_0.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Ell, AdvancedApplyFailsOnWrongInnerDimension)
+{
+    using MVec = typename TestFixture::MVec;
+    // Scalars are 1x1; the input has 2 rows where the matrix has 3 columns.
+    const gko::batch_dim<2> scalar_size{2, gko::dim<2>{1, 1}};
+    const gko::batch_dim<2> bad_size{2, gko::dim<2>{2, 3}};
+    auto bad_b = MVec::create(this->exec, bad_size);
+    auto alpha = MVec::create(this->exec, scalar_size);
+    auto beta = MVec::create(this->exec, scalar_size);
+
+    ASSERT_THROW(this->mtx_0->apply(alpha.get(), bad_b.get(), beta.get(),
+                                    this->x_0.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Ell, AdvancedApplyFailsOnWrongAlphaDimension)
+{
+    using MVec = typename TestFixture::MVec;
+    // The input is 3x3 like the fixture's b_0; alpha is 2x1 instead of 1x1.
+    const gko::batch_dim<2> input_size{2, gko::dim<2>{3, 3}};
+    const gko::batch_dim<2> bad_alpha_size{2, gko::dim<2>{2, 1}};
+    const gko::batch_dim<2> beta_size{2, gko::dim<2>{1, 1}};
+    auto b = MVec::create(this->exec, input_size);
+    auto alpha = MVec::create(this->exec, bad_alpha_size);
+    auto beta = MVec::create(this->exec, beta_size);
+    ASSERT_THROW(
+        this->mtx_0->apply(alpha.get(), b.get(), beta.get(), this->x_0.get()),
+        gko::DimensionMismatch);
+}
diff --git a/test/matrix/CMakeLists.txt b/test/matrix/CMakeLists.txt
index 9f3b17cd858..f1c91e615e7 100644
--- a/test/matrix/CMakeLists.txt
+++ b/test/matrix/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_common_test(batch_dense_kernels)
+ginkgo_create_common_test(batch_ell_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_device_test(csr_kernels)
 ginkgo_create_common_test(csr_kernels2)
 ginkgo_create_common_test(coo_kernels)
diff --git a/test/matrix/batch_ell_kernels.cpp b/test/matrix/batch_ell_kernels.cpp
new file mode 100644
index 00000000000..9629a2263ff
--- /dev/null
+++ b/test/matrix/batch_ell_kernels.cpp
@@ -0,0 +1,128 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "core/base/batch_utilities.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/assertions.hpp"
+#include "core/test/utils/batch_helpers.hpp"
+#include "test/utils/executor.hpp"
+
+// Fixture creating random apply operands on ref and cloning them to exec.
+class Ell : public CommonTestFixture {
+protected:
+    using Mtx = gko::batch::matrix::Ell<value_type>;
+    using MVec = gko::batch::MultiVector<value_type>;
+
+    Ell() : rand_engine(15) {}  // fixed seed 15
+    // Random batch object on ref, drawn with the fixture's rand_engine.
+    template <typename MtxType>
+    std::unique_ptr<MtxType> gen_mtx(const gko::size_type num_batch_items,
+                                     gko::size_type num_rows,
+                                     gko::size_type num_cols)
+    {
+        return gko::test::generate_random_batch_matrix<MtxType>(
+            num_batch_items, num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),  // constant
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+    // Fills all members: 252x32 matrices, 32 x num_vecs inputs, 1x1 scalars.
+    void set_up_apply_data(gko::size_type num_vecs = 1)
+    {
+        const int num_rows = 252;
+        const int num_cols = 32;
+        x = gen_mtx<Mtx>(batch_size, num_rows, num_cols);
+        y = gen_mtx<MVec>(batch_size, num_cols, num_vecs);
+        alpha = gen_mtx<MVec>(batch_size, 1, 1);
+        beta = gen_mtx<MVec>(batch_size, 1, 1);
+        dx = gko::clone(exec, x);  // d-prefixed members: clones on exec
+        dy = gko::clone(exec, y);
+        dalpha = gko::clone(exec, alpha);
+        dbeta = gko::clone(exec, beta);
+        expected = MVec::create(
+            ref,
+            gko::batch_dim<2>(batch_size, gko::dim<2>{num_rows, num_vecs}));
+        expected->fill(gko::one<value_type>());  // both outputs start as ones
+        dresult = gko::clone(exec, expected);
+    }
+
+    std::ranlux48 rand_engine;
+
+    const size_t batch_size = 11;  // number of batch items of every operand
+    std::unique_ptr<Mtx> x;
+    std::unique_ptr<MVec> y;
+    std::unique_ptr<MVec> alpha;
+    std::unique_ptr<MVec> beta;
+    std::unique_ptr<MVec> expected;
+    std::unique_ptr<MVec> dresult;
+    std::unique_ptr<Mtx> dx;
+    std::unique_ptr<MVec> dy;
+    std::unique_ptr<MVec> dalpha;
+    std::unique_ptr<MVec> dbeta;
+};
+
+
+TEST_F(Ell, SingleVectorApplyIsEquivalentToRef)
+{
+    const gko::size_type num_rhs = 1;
+    set_up_apply_data(num_rhs);
+    // dx, dy and dresult are the clones on exec of x, y and expected.
+    dx->apply(dy.get(), dresult.get());
+    x->apply(y.get(), expected.get());
+    GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r<value_type>::value);
+}
+
+
+TEST_F(Ell, SingleVectorAdvancedApplyIsEquivalentToRef)
+{
+    const gko::size_type num_rhs = 1;
+    set_up_apply_data(num_rhs);
+    // Each d-prefixed operand is the clone on exec of its ref counterpart.
+    dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    x->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r<value_type>::value);
+}

From 174c3fdc726ce1a896939dfb4f6a335b3b6ed25f Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Sun, 8 Oct 2023 12:03:36 +0200
Subject: [PATCH 06/18] Add DPCPP kernels and tests

Co-authored-by: Phuong Nguyen <phuong.nguyen@icl.utk.edu>
---
 core/test/utils/batch_helpers.hpp      |  2 -
 dpcpp/matrix/batch_ell_kernels.dp.cpp  | 84 ++++++++++++++++++++++++--
 dpcpp/matrix/batch_ell_kernels.hpp.inc | 79 ++++++++++++++++++++++++
 dpcpp/matrix/batch_struct.hpp          | 34 +++++++++++
 test/matrix/CMakeLists.txt             |  2 +-
 test/matrix/batch_ell_kernels.cpp      | 26 ++++++--
 6 files changed, 213 insertions(+), 14 deletions(-)
 create mode 100644 dpcpp/matrix/batch_ell_kernels.hpp.inc

diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp
index 4cf9d4973e2..b040691999e 100644
--- a/core/test/utils/batch_helpers.hpp
+++ b/core/test/utils/batch_helpers.hpp
@@ -83,8 +83,6 @@ std::unique_ptr<MatrixType> generate_random_batch_matrix(
         exec, batch_dim<2>(num_batch_items, dim<2>(num_rows, num_cols)),
         std::forward<MatrixArgs>(args)...);
 
-    // TODO: Need to preserve sparsity pattern across batch items for batched
-    // sparse matrix formats
     for (size_type b = 0; b < num_batch_items; b++) {
         auto rand_mat =
             generate_random_matrix<typename MatrixType::unbatch_type>(
diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
index cdcd5abd024..1ed83d79630 100644
--- a/dpcpp/matrix/batch_ell_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "core/matrix/batch_dense_kernels.hpp"
+#include "core/matrix/batch_ell_kernels.hpp"
 
 
 #include <algorithm>
@@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 
 
 #include "core/base/batch_struct.hpp"
@@ -71,14 +71,48 @@ namespace dpcpp {
 namespace batch_ell {
 
 
-// #include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
+#include "dpcpp/matrix/batch_ell_kernels.hpp.inc"
 
 
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const batch::matrix::Ell<ValueType, IndexType>* mat,
                   const batch::MultiVector<ValueType>* b,
-                  batch::MultiVector<ValueType>* x) GKO_NOT_IMPLEMENTED;
+                  batch::MultiVector<ValueType>* x)
+{  // x = mat * b per batch item; only b with a single rhs is implemented
+    const size_type num_rows = mat->get_common_size()[0];  // unused below
+    const size_type num_cols = mat->get_common_size()[1];  // unused below
+
+    const auto num_batch_items = mat->get_num_batch_items();
+    auto device = exec->get_queue()->get_device();
+    auto group_size =
+        device.get_info<sycl::info::device::max_work_group_size>();
+
+    const dim3 block(group_size);  // work-items per group: device maximum
+    const dim3 grid(num_batch_items);  // one work-group per batch item
+    const auto x_ub = get_batch_struct(x);
+    const auto b_ub = get_batch_struct(b);
+    const auto mat_ub = get_batch_struct(mat);
+    if (b_ub.num_rhs > 1) {  // more than one right-hand side: unsupported
+        GKO_NOT_IMPLEMENTED;
+    }
+
+    // Launch a kernel that has nbatches blocks, each block has max group size
+    (exec->get_queue())->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(
+            sycl_nd_range(grid, block), [=
+        ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
+                                            config::warp_size)]] {
+                auto group = item_ct1.get_group();
+                auto group_id = group.get_group_linear_id();
+                const auto mat_b =
+                    batch::matrix::extract_batch_item(mat_ub, group_id);
+                const auto b_b = batch::extract_batch_item(b_ub, group_id);
+                const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                simple_apply_kernel(mat_b, b_b, x_b, item_ct1);
+            });
+    });
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
@@ -90,7 +124,47 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
                     const batch::matrix::Ell<ValueType, IndexType>* mat,
                     const batch::MultiVector<ValueType>* b,
                     const batch::MultiVector<ValueType>* beta,
-                    batch::MultiVector<ValueType>* x) GKO_NOT_IMPLEMENTED;
+                    batch::MultiVector<ValueType>* x)
+{
+    const auto mat_ub = get_batch_struct(mat);
+    const auto b_ub = get_batch_struct(b);
+    const auto x_ub = get_batch_struct(x);
+    const auto alpha_ub = get_batch_struct(alpha);
+    const auto beta_ub = get_batch_struct(beta);
+
+    if (b_ub.num_rhs > 1) {
+        GKO_NOT_IMPLEMENTED;
+    }
+
+    const auto num_batch_items = mat_ub.num_batch_items;
+    auto device = exec->get_queue()->get_device();
+    auto group_size =
+        device.get_info<sycl::info::device::max_work_group_size>();
+
+    const dim3 block(group_size);
+    const dim3 grid(num_batch_items);
+
+    // Launch a kernel that has nbatches blocks, each block has max group size
+    (exec->get_queue())->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(
+            sycl_nd_range(grid, block), [=
+        ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
+                                            config::warp_size)]] {
+                auto group = item_ct1.get_group();
+                auto group_id = group.get_group_linear_id();
+                const auto mat_b =
+                    batch::matrix::extract_batch_item(mat_ub, group_id);
+                const auto b_b = batch::extract_batch_item(b_ub, group_id);
+                const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                const auto alpha_b =
+                    batch::extract_batch_item(alpha_ub, group_id);
+                const auto beta_b =
+                    batch::extract_batch_item(beta_ub, group_id);
+                advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b,
+                                      item_ct1);
+            });
+    });
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
     GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp.inc
new file mode 100644
index 00000000000..1048f2f8ff8
--- /dev/null
+++ b/dpcpp/matrix/batch_ell_kernels.hpp.inc
@@ -0,0 +1,79 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+template <typename ValueType>
+__dpct_inline__ void simple_apply_kernel(
+    const gko::batch::matrix::batch_ell::batch_item<const ValueType>& mat,
+    const gko::batch::multi_vector::batch_item<const ValueType>& b,
+    const gko::batch::multi_vector::batch_item<ValueType>& x,
+    sycl::nd_item<3>& item_ct1)
+{  // computes x = mat * b for one batch item
+    for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows;
+         tidx += item_ct1.get_local_range().size()) {  // step: group size
+        auto temp = zero<ValueType>();  // accumulator for row tidx
+        for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) {
+            const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
+            if (col_idx < idx)  // NOTE(review): presumably a padding test
+                break;  // skips the remaining slots of this row
+            else
+                temp += mat.values[tidx + idx * mat.stride] *
+                        b.values[col_idx * b.stride];
+        }
+        x.values[tidx * x.stride] = temp;  // overwrites the old x entry
+    }
+}
+
+
+template <typename ValueType>
+__dpct_inline__ void advanced_apply_kernel(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::matrix::batch_ell::batch_item<const ValueType>& mat,
+    const gko::batch::multi_vector::batch_item<const ValueType>& b,
+    const gko::batch::multi_vector::batch_item<const ValueType>& beta,
+    const gko::batch::multi_vector::batch_item<ValueType>& x,
+    sycl::nd_item<3>& item_ct1)
+{  // computes x = alpha * mat * b + beta * x for one batch item
+    for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows;
+         tidx += item_ct1.get_local_range().size()) {  // step: group size
+        auto temp = zero<ValueType>();  // accumulates alpha * (row . b)
+        for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) {
+            const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
+            if (col_idx < idx)  // NOTE(review): presumably a padding test
+                break;  // skips the remaining slots of this row
+            else
+                temp += alpha.values[0] * mat.values[tidx + idx * mat.stride] *
+                        b.values[col_idx * b.stride];
+        }
+        x.values[tidx * x.stride] =  // alpha and beta are read as scalars
+            temp + beta.values[0] * x.values[tidx * x.stride];
+    }
+}
diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp
index b0393daf55d..35ff1148dd5 100644
--- a/dpcpp/matrix/batch_struct.hpp
+++ b/dpcpp/matrix/batch_struct.hpp
@@ -86,6 +86,40 @@ inline batch::matrix::dense::uniform_batch<ValueType> get_batch_struct(
 }
 
 
+/**
+ * Generates an immutable uniform batch struct from a batch of ell matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::batch_ell::uniform_batch<const ValueType>
+get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
+{  // the stride is set equal to the number of rows
+    return {op->get_const_values(),
+            op->get_const_col_idxs(),
+            op->get_num_batch_items(),
+            static_cast<int>(op->get_common_size()[0]),  // stride
+            static_cast<int>(op->get_common_size()[0]),  // num_rows
+            static_cast<int>(op->get_common_size()[1]),  // num_cols
+            static_cast<int>(op->get_num_stored_elements_per_row())};
+}
+
+
+/**
+ * Generates a uniform batch struct from a batch of ell matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::batch_ell::uniform_batch<ValueType> get_batch_struct(
+    batch::matrix::Ell<ValueType, int32>* const op)
+{  // the stride is set equal to the number of rows
+    return {op->get_values(),
+            op->get_col_idxs(),
+            op->get_num_batch_items(),
+            static_cast<int>(op->get_common_size()[0]),  // stride
+            static_cast<int>(op->get_common_size()[0]),  // num_rows
+            static_cast<int>(op->get_common_size()[1]),  // num_cols
+            static_cast<int>(op->get_num_stored_elements_per_row())};
+}
+
+
 }  // namespace dpcpp
 }  // namespace kernels
 }  // namespace gko
diff --git a/test/matrix/CMakeLists.txt b/test/matrix/CMakeLists.txt
index f1c91e615e7..a03a0a0bb4e 100644
--- a/test/matrix/CMakeLists.txt
+++ b/test/matrix/CMakeLists.txt
@@ -1,5 +1,5 @@
 ginkgo_create_common_test(batch_dense_kernels)
-ginkgo_create_common_test(batch_ell_kernels DISABLE_EXECUTORS dpcpp)
+ginkgo_create_common_test(batch_ell_kernels)
 ginkgo_create_common_device_test(csr_kernels)
 ginkgo_create_common_test(csr_kernels2)
 ginkgo_create_common_test(coo_kernels)
diff --git a/test/matrix/batch_ell_kernels.cpp b/test/matrix/batch_ell_kernels.cpp
index 9629a2263ff..bc1e0c7fb42 100644
--- a/test/matrix/batch_ell_kernels.cpp
+++ b/test/matrix/batch_ell_kernels.cpp
@@ -63,22 +63,36 @@ class Ell : public CommonTestFixture {
     template <typename MtxType>
     std::unique_ptr<MtxType> gen_mtx(const gko::size_type num_batch_items,
                                      gko::size_type num_rows,
-                                     gko::size_type num_cols)
+                                     gko::size_type num_cols,
+                                     int num_elems_per_row)
     {
         return gko::test::generate_random_batch_matrix<MtxType>(
+            num_batch_items, num_rows, num_cols,
+            std::uniform_int_distribution<>(num_elems_per_row,
+                                            num_elems_per_row),  // a == b
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref,
+            num_elems_per_row);  // extra arg, presumably for MtxType::create
+    }
+
+    std::unique_ptr<MVec> gen_mvec(const gko::size_type num_batch_items,
+                                   gko::size_type num_rows,
+                                   gko::size_type num_cols)
+    {  // random MVec on `ref`; first distribution is fixed at num_cols
+        return gko::test::generate_random_batch_matrix<MVec>(
             num_batch_items, num_rows, num_cols,
             std::uniform_int_distribution<>(num_cols, num_cols),
             std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
     }
 
-    void set_up_apply_data(gko::size_type num_vecs = 1)
+    void set_up_apply_data(gko::size_type num_vecs = 1,
+                           int num_elems_per_row = 5)
     {
         const int num_rows = 252;
         const int num_cols = 32;
-        x = gen_mtx<Mtx>(batch_size, num_rows, num_cols);
-        y = gen_mtx<MVec>(batch_size, num_cols, num_vecs);
-        alpha = gen_mtx<MVec>(batch_size, 1, 1);
-        beta = gen_mtx<MVec>(batch_size, 1, 1);
+        x = gen_mtx<Mtx>(batch_size, num_rows, num_cols, num_elems_per_row);
+        y = gen_mvec(batch_size, num_cols, num_vecs);
+        alpha = gen_mvec(batch_size, 1, 1);
+        beta = gen_mvec(batch_size, 1, 1);
         dx = gko::clone(exec, x);
         dy = gko::clone(exec, y);
         dalpha = gko::clone(exec, alpha);

From f4168f95277eb6d6e30c8e43e4ab7e31ce63da16 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Sun, 8 Oct 2023 12:15:00 +0200
Subject: [PATCH 07/18] Update docs

---
 include/ginkgo/core/matrix/batch_ell.hpp | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index 490f7a7d4b0..48a3a6d9831 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -55,19 +55,15 @@ namespace matrix {
 
 
 /**
- * Ell is a batch matrix format which explicitly stores all values of the
- * matrix in each of the batches.
+ * Ell is a sparse matrix format that stores the same number of nonzeros in each
+ * row, enabling coalesced accesses. It is suitable for sparsity patterns that
+ * have a similar number of nonzeros in every row. The values are stored in a
+ * column-major fashion similar to the monolithic gko::matrix::Ell class. It is
+ * also assumed that the sparsity pattern of all the items in the batch is the
+ * same and therefore only a single copy of the sparsity pattern is stored.
  *
- * The values in each of the batches are stored in row-major format (values
- * belonging to the same row appear consecutive in the memory and the values of
- * each batch item are also stored consecutively in memory).
- *
- * @note Though the storage layout is similar to the multi-vector object, the
- * class semantics and the operations it aims to provide is different. Hence it
- * is recommended to create multi-vector objects if the user means to view the
- * data as a set of vectors.
- *
- * @tparam ValueType  precision of matrix elements
+ * @tparam ValueType  value precision of matrix elements
+ * @tparam IndexType  index precision of matrix elements
  *
  * @ingroup batch_ell
  * @ingroup mat_formats

From 7cb4c51c2135404ff67cbbc0322e3401530379eb Mon Sep 17 00:00:00 2001
From: ginkgo-bot <ginkgo.library@gmail.com>
Date: Sun, 8 Oct 2023 10:49:57 +0000
Subject: [PATCH 08/18] Format files

Co-authored-by: Pratik Nayak <pratikvn@pm.me>
---
 dpcpp/matrix/batch_ell_kernels.dp.cpp         | 54 +++++++++----------
 .../ginkgo/core/base/batch_multi_vector.hpp   |  4 +-
 include/ginkgo/core/matrix/batch_ell.hpp      |  8 +--
 include/ginkgo/ginkgo.hpp                     |  1 +
 4 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
index 1ed83d79630..1d1210cc270 100644
--- a/dpcpp/matrix/batch_ell_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -100,17 +100,17 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     // Launch a kernel that has nbatches blocks, each block has max group size
     (exec->get_queue())->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(
-            sycl_nd_range(grid, block), [=
-        ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
-                                            config::warp_size)]] {
-                auto group = item_ct1.get_group();
-                auto group_id = group.get_group_linear_id();
-                const auto mat_b =
-                    batch::matrix::extract_batch_item(mat_ub, group_id);
-                const auto b_b = batch::extract_batch_item(b_ub, group_id);
-                const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                simple_apply_kernel(mat_b, b_b, x_b, item_ct1);
-            });
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(config::warp_size)]] {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto mat_b =
+                        batch::matrix::extract_batch_item(mat_ub, group_id);
+                    const auto b_b = batch::extract_batch_item(b_ub, group_id);
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    simple_apply_kernel(mat_b, b_b, x_b, item_ct1);
+                });
     });
 }
 
@@ -147,22 +147,22 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     // Launch a kernel that has nbatches blocks, each block has max group size
     (exec->get_queue())->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(
-            sycl_nd_range(grid, block), [=
-        ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
-                                            config::warp_size)]] {
-                auto group = item_ct1.get_group();
-                auto group_id = group.get_group_linear_id();
-                const auto mat_b =
-                    batch::matrix::extract_batch_item(mat_ub, group_id);
-                const auto b_b = batch::extract_batch_item(b_ub, group_id);
-                const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                const auto alpha_b =
-                    batch::extract_batch_item(alpha_ub, group_id);
-                const auto beta_b =
-                    batch::extract_batch_item(beta_ub, group_id);
-                advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b,
-                                      item_ct1);
-            });
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(config::warp_size)]] {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto mat_b =
+                        batch::matrix::extract_batch_item(mat_ub, group_id);
+                    const auto b_b = batch::extract_batch_item(b_ub, group_id);
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    const auto alpha_b =
+                        batch::extract_batch_item(alpha_ub, group_id);
+                    const auto beta_b =
+                        batch::extract_batch_item(beta_ub, group_id);
+                    advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b,
+                                          item_ct1);
+                });
     });
 }
 
diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp
index 45ba0686468..9a4b8d5cf1d 100644
--- a/include/ginkgo/core/base/batch_multi_vector.hpp
+++ b/include/ginkgo/core/base/batch_multi_vector.hpp
@@ -212,8 +212,8 @@ class MultiVector
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const value_type* get_const_values_for_item(size_type batch_id) const
-        noexcept
+    const value_type* get_const_values_for_item(
+        size_type batch_id) const noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return values_.get_const_data() + this->get_cumulative_offset(batch_id);
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index 48a3a6d9831..5cb5f73dec5 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -214,8 +214,8 @@ class Ell final
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const index_type* get_const_col_idxs_for_item(size_type batch_id) const
-        noexcept
+    const index_type* get_const_col_idxs_for_item(
+        size_type batch_id) const noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return col_idxs_.get_const_data();
@@ -243,8 +243,8 @@ class Ell final
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const value_type* get_const_values_for_item(size_type batch_id) const
-        noexcept
+    const value_type* get_const_values_for_item(
+        size_type batch_id) const noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return values_.get_const_data() +
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index 8bb29242e88..ad90e264189 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -109,6 +109,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/log/stream.hpp>
 
 #include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>

From 88b6e3aee599f26e339d1b7357c0509bd28afdd1 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 10 Oct 2023 17:15:48 +0200
Subject: [PATCH 09/18] Some general fixes.

---
 .../cuda_hip/matrix/batch_ell_kernels.hpp.inc | 13 ++-
 core/matrix/batch_ell.cpp                     | 32 +------
 core/matrix/batch_struct.hpp                  | 20 ++---
 core/test/matrix/batch_ell.cpp                |  8 +-
 cuda/matrix/batch_dense_kernels.cu            |  3 +-
 cuda/matrix/batch_ell_kernels.cu              |  6 +-
 cuda/matrix/batch_struct.hpp                  | 23 ++---
 dpcpp/matrix/batch_ell_kernels.dp.cpp         | 62 +++++++------
 dpcpp/matrix/batch_ell_kernels.hpp.inc        |  4 +-
 dpcpp/matrix/batch_struct.hpp                 | 23 ++---
 hip/matrix/batch_ell_kernels.hip.cpp          |  6 +-
 hip/matrix/batch_struct.hip.hpp               | 23 ++---
 include/ginkgo/core/matrix/batch_ell.hpp      |  8 --
 omp/matrix/batch_dense_kernels.cpp            |  4 +-
 omp/matrix/batch_ell_kernels.cpp              |  4 +-
 reference/matrix/batch_dense_kernels.cpp      |  5 +-
 reference/matrix/batch_ell_kernels.cpp        |  5 +-
 reference/matrix/batch_ell_kernels.hpp.inc    |  4 +-
 reference/matrix/batch_struct.hpp             | 22 ++---
 reference/test/matrix/batch_ell_kernels.cpp   | 87 ++++++-------------
 test/matrix/batch_ell_kernels.cpp             | 59 ++++++-------
 test/test_install/test_install.cpp            |  9 +-
 22 files changed, 184 insertions(+), 246 deletions(-)

diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
index e55e7a60471..5c00358c5a0 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
@@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 template <typename ValueType>
 __device__ __forceinline__ void simple_apply(
-    const gko::batch::matrix::batch_ell::batch_item<const ValueType>& mat,
+    const gko::batch::matrix::ell::batch_item<const ValueType>& mat,
     const ValueType* const __restrict__ b, ValueType* const __restrict__ x)
 {
     const auto num_rows = mat.num_rows;
@@ -60,7 +60,7 @@ template <typename ValueType>
 __global__ __launch_bounds__(
     default_block_size,
     sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
-                                                      batch_ell::uniform_batch<
+                                                      ell::uniform_batch<
                                                           const ValueType>
                                                           mat,
                                                   const gko::batch::
@@ -88,7 +88,7 @@ __global__ __launch_bounds__(
 template <typename ValueType>
 __device__ __forceinline__ void advanced_apply(
     const ValueType alpha,
-    const gko::batch::matrix::batch_ell::batch_item<const ValueType>& mat,
+    const gko::batch::matrix::ell::batch_item<const ValueType>& mat,
     const ValueType* const __restrict__ b, const ValueType beta,
     ValueType* const __restrict__ x)
 {
@@ -121,10 +121,9 @@ __global__ __launch_bounds__(
                                                                 const ValueType>
                                                                 alpha,
                                                     const gko::batch::matrix::
-                                                        batch_ell::
-                                                            uniform_batch<
-                                                                const ValueType>
-                                                                mat,
+                                                        ell::uniform_batch<
+                                                            const ValueType>
+                                                            mat,
                                                     const gko::batch::
                                                         multi_vector::
                                                             uniform_batch<
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
index 0d903b10968..f421fdf2b49 100644
--- a/core/matrix/batch_ell.cpp
+++ b/core/matrix/batch_ell.cpp
@@ -104,22 +104,10 @@ template <typename ValueType, typename IndexType>
 std::unique_ptr<Ell<ValueType, IndexType>>
 Ell<ValueType, IndexType>::create_with_config_of(
     ptr_param<const Ell<ValueType, IndexType>> other)
-{
-    // De-referencing `other` before calling the functions (instead of
-    // using operator `->`) is currently required to be compatible with
-    // CUDA 10.1.
-    // Otherwise, it results in a compile error.
-    return (*other).create_with_same_config();
-}
-
-
-template <typename ValueType, typename IndexType>
-std::unique_ptr<Ell<ValueType, IndexType>>
-Ell<ValueType, IndexType>::create_with_same_config() const
 {
     return Ell<ValueType, IndexType>::create(
-        this->get_executor(), this->get_size(),
-        this->get_num_stored_elements_per_row());
+        other->get_executor(), other->get_size(),
+        other->get_num_stored_elements_per_row());
 }
 
 
@@ -163,12 +151,7 @@ template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::apply_impl(const MultiVector<ValueType>* b,
                                            MultiVector<ValueType>* x) const
 {
-    GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items());
-    GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items());
-
-    GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size());
-    GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size());
-    GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size());
+    this->validate_application_parameters(b, x);
     this->get_executor()->run(ell::make_simple_apply(this, b, x));
 }
 
@@ -179,14 +162,7 @@ void Ell<ValueType, IndexType>::apply_impl(const MultiVector<ValueType>* alpha,
                                            const MultiVector<ValueType>* beta,
                                            MultiVector<ValueType>* x) const
 {
-    GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items());
-    GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items());
-
-    GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size());
-    GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size());
-    GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size());
-    GKO_ASSERT_EQUAL_DIMENSIONS(alpha->get_common_size(), gko::dim<2>(1, 1));
-    GKO_ASSERT_EQUAL_DIMENSIONS(beta->get_common_size(), gko::dim<2>(1, 1));
+    this->validate_application_parameters(alpha, b, beta, x);
     this->get_executor()->run(
         ell::make_advanced_apply(alpha, this, b, beta, x));
 }
diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp
index 2eed40882bc..eeeeebd53d6 100644
--- a/core/matrix/batch_struct.hpp
+++ b/core/matrix/batch_struct.hpp
@@ -83,7 +83,7 @@ struct uniform_batch {
 }  // namespace dense
 
 
-namespace batch_ell {
+namespace ell {
 
 
 /**
@@ -109,7 +109,7 @@ struct batch_item {
 template <typename ValueType>
 struct uniform_batch {
     using value_type = ValueType;
-    using index_type = int;
+    using index_type = int32;
     using entry_type = batch_item<value_type>;
 
     ValueType* values;
@@ -127,7 +127,7 @@ struct uniform_batch {
 };
 
 
-}  // namespace batch_ell
+}  // namespace ell
 
 
 template <typename ValueType>
@@ -165,8 +165,8 @@ GKO_ATTRIBUTES GKO_INLINE dense::batch_item<ValueType> extract_batch_item(
 
 
 template <typename ValueType>
-GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item<const ValueType> to_const(
-    const batch_ell::batch_item<ValueType>& b)
+GKO_ATTRIBUTES GKO_INLINE ell::batch_item<const ValueType> to_const(
+    const ell::batch_item<ValueType>& b)
 {
     return {b.values,   b.col_idxs, b.stride,
             b.num_rows, b.num_cols, b.num_stored_elems_per_row};
@@ -174,8 +174,8 @@ GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item<const ValueType> to_const(
 
 
 template <typename ValueType>
-GKO_ATTRIBUTES GKO_INLINE batch_ell::uniform_batch<const ValueType> to_const(
-    const batch_ell::uniform_batch<ValueType>& ub)
+GKO_ATTRIBUTES GKO_INLINE ell::uniform_batch<const ValueType> to_const(
+    const ell::uniform_batch<ValueType>& ub)
 {
     return {ub.values,   ub.col_idxs, ub.num_batch_items,         ub.stride,
             ub.num_rows, ub.num_cols, ub.num_stored_elems_per_row};
@@ -183,8 +183,8 @@ GKO_ATTRIBUTES GKO_INLINE batch_ell::uniform_batch<const ValueType> to_const(
 
 
 template <typename ValueType>
-GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item<ValueType> extract_batch_item(
-    const batch_ell::uniform_batch<ValueType>& batch, const size_type batch_idx)
+GKO_ATTRIBUTES GKO_INLINE ell::batch_item<ValueType> extract_batch_item(
+    const ell::uniform_batch<ValueType>& batch, const size_type batch_idx)
 {
     return {batch.values +
                 batch_idx * batch.num_stored_elems_per_row * batch.num_rows,
@@ -196,7 +196,7 @@ GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item<ValueType> extract_batch_item(
 }
 
 template <typename ValueType>
-GKO_ATTRIBUTES GKO_INLINE batch_ell::batch_item<ValueType> extract_batch_item(
+GKO_ATTRIBUTES GKO_INLINE ell::batch_item<ValueType> extract_batch_item(
     ValueType* const batch_values, int* const batch_col_idxs, const int stride,
     const int num_rows, const int num_cols, int num_elems_per_row,
     const size_type batch_idx)
diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp
index 2830705bf5f..e4dcab23917 100644
--- a/core/test/matrix/batch_ell.cpp
+++ b/core/test/matrix/batch_ell.cpp
@@ -144,6 +144,7 @@ TYPED_TEST(Ell, SparseMtxKnowsItsSizeAndValues)
 TYPED_TEST(Ell, CanBeEmpty)
 {
     auto empty = gko::batch::matrix::Ell<TypeParam>::create(this->exec);
+
     this->assert_empty(empty.get());
 }
 
@@ -151,6 +152,7 @@ TYPED_TEST(Ell, CanBeEmpty)
 TYPED_TEST(Ell, ReturnsNullValuesArrayWhenEmpty)
 {
     auto empty = gko::batch::matrix::Ell<TypeParam>::create(this->exec);
+
     ASSERT_EQ(empty->get_const_values(), nullptr);
 }
 
@@ -284,7 +286,6 @@ TYPED_TEST(Ell, CanBeConstructedFromEllMatrices)
     using value_type = typename TestFixture::value_type;
     using EllMtx = typename TestFixture::EllMtx;
     using size_type = gko::size_type;
-
     auto mat1 = gko::initialize<EllMtx>({{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}},
                                         this->exec);
     auto mat2 =
@@ -304,15 +305,14 @@ TYPED_TEST(Ell, CanBeConstructedFromEllMatricesByDuplication)
     using index_type = int;
     using EllMtx = typename TestFixture::EllMtx;
     using size_type = gko::size_type;
-
     auto mat1 =
         gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 0.0}}, this->exec);
-
     auto bat_m =
         gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
             this->exec,
             std::vector<EllMtx*>{mat1.get(), mat1.get(), mat1.get()},
             mat1->get_num_stored_elements_per_row());
+
     auto m = gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
         this->exec, 3, mat1.get(), mat1->get_num_stored_elements_per_row());
 
@@ -326,7 +326,6 @@ TYPED_TEST(Ell, CanBeConstructedByDuplicatingEllMatrices)
     using index_type = int;
     using EllMtx = typename TestFixture::EllMtx;
     using size_type = gko::size_type;
-
     auto mat1 = gko::initialize<EllMtx>({{-1.0, 0.0, 0.0}, {0.0, 2.5, 0.0}},
                                         this->exec);
     auto mat2 =
@@ -372,6 +371,7 @@ TYPED_TEST(Ell, CanBeListConstructed)
 {
     using value_type = typename TestFixture::value_type;
     using index_type = int;
+
     auto m = gko::batch::initialize<gko::batch::matrix::Ell<TypeParam>>(
         {{0.0, -1.0}, {1.0, 0.0}}, this->exec);
 
diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu
index dd82e15b8cc..c693a3ae861 100644
--- a/cuda/matrix/batch_dense_kernels.cu
+++ b/cuda/matrix/batch_dense_kernels.cu
@@ -36,7 +36,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <thrust/functional.h>
 
 
-#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
 
 
 #include "core/base/batch_struct.hpp"
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
index ee6a99f04ca..6dd268a2d8e 100644
--- a/cuda/matrix/batch_ell_kernels.cu
+++ b/cuda/matrix/batch_ell_kernels.cu
@@ -34,18 +34,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <thrust/functional.h>
-#include <thrust/transform.h>
 
 
-#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "cuda/base/batch_struct.hpp"
 #include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
 #include "cuda/base/thrust.cuh"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp
index 7a6a4ac7f00..e2db1ea6e97 100644
--- a/cuda/matrix/batch_struct.hpp
+++ b/cuda/matrix/batch_struct.hpp
@@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 
 
 #include "core/base/batch_struct.hpp"
@@ -91,16 +92,16 @@ get_batch_struct(batch::matrix::Dense<ValueType>* const op)
  * Generates an immutable uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType>
-inline batch::matrix::batch_ell::uniform_batch<const cuda_type<ValueType>>
+inline batch::matrix::ell::uniform_batch<const cuda_type<ValueType>>
 get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
 {
     return {as_cuda_type(op->get_const_values()),
             op->get_const_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[1]),
-            static_cast<int>(op->get_num_stored_elements_per_row())};
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_num_stored_elements_per_row())};
 }
 
 
@@ -108,16 +109,16 @@ get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
  * Generates a uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType>
-inline batch::matrix::batch_ell::uniform_batch<cuda_type<ValueType>>
-get_batch_struct(batch::matrix::Ell<ValueType, int32>* const op)
+inline batch::matrix::ell::uniform_batch<cuda_type<ValueType>> get_batch_struct(
+    batch::matrix::Ell<ValueType, int32>* const op)
 {
     return {as_cuda_type(op->get_values()),
             op->get_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[1]),
-            static_cast<int>(op->get_num_stored_elements_per_row())};
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_num_stored_elements_per_row())};
 }
 
 
diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
index 1d1210cc270..fca265eceb0 100644
--- a/dpcpp/matrix/batch_ell_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -39,17 +39,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <CL/sycl.hpp>
 
 
-#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
 
 #include "core/base/batch_struct.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "dpcpp/base/batch_struct.hpp"
-#include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
 #include "dpcpp/base/dpct.hpp"
 #include "dpcpp/base/helper.hpp"
@@ -98,19 +94,19 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     }
 
     // Launch a kernel that has nbatches blocks, each block has max group size
-    (exec->get_queue())->submit([&](sycl::handler& cgh) {
+    exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(
-            sycl_nd_range(grid, block),
-            [=](sycl::nd_item<3> item_ct1)
-                [[sycl::reqd_sub_group_size(config::warp_size)]] {
-                    auto group = item_ct1.get_group();
-                    auto group_id = group.get_group_linear_id();
-                    const auto mat_b =
-                        batch::matrix::extract_batch_item(mat_ub, group_id);
-                    const auto b_b = batch::extract_batch_item(b_ub, group_id);
-                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    simple_apply_kernel(mat_b, b_b, x_b, item_ct1);
-                });
+            sycl_nd_range(grid, block), [=
+        ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
+                                            config::warp_size)]] {
+                auto group = item_ct1.get_group();
+                auto group_id = group.get_group_linear_id();
+                const auto mat_b =
+                    batch::matrix::extract_batch_item(mat_ub, group_id);
+                const auto b_b = batch::extract_batch_item(b_ub, group_id);
+                const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                simple_apply_kernel(mat_b, b_b, x_b, item_ct1);
+            });
     });
 }
 
@@ -145,24 +141,24 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     const dim3 grid(num_batch_items);
 
     // Launch a kernel that has nbatches blocks, each block has max group size
-    (exec->get_queue())->submit([&](sycl::handler& cgh) {
+    exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(
-            sycl_nd_range(grid, block),
-            [=](sycl::nd_item<3> item_ct1)
-                [[sycl::reqd_sub_group_size(config::warp_size)]] {
-                    auto group = item_ct1.get_group();
-                    auto group_id = group.get_group_linear_id();
-                    const auto mat_b =
-                        batch::matrix::extract_batch_item(mat_ub, group_id);
-                    const auto b_b = batch::extract_batch_item(b_ub, group_id);
-                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    const auto alpha_b =
-                        batch::extract_batch_item(alpha_ub, group_id);
-                    const auto beta_b =
-                        batch::extract_batch_item(beta_ub, group_id);
-                    advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b,
-                                          item_ct1);
-                });
+            sycl_nd_range(grid, block), [=
+        ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
+                                            config::warp_size)]] {
+                auto group = item_ct1.get_group();
+                auto group_id = group.get_group_linear_id();
+                const auto mat_b =
+                    batch::matrix::extract_batch_item(mat_ub, group_id);
+                const auto b_b = batch::extract_batch_item(b_ub, group_id);
+                const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                const auto alpha_b =
+                    batch::extract_batch_item(alpha_ub, group_id);
+                const auto beta_b =
+                    batch::extract_batch_item(beta_ub, group_id);
+                advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b,
+                                      item_ct1);
+            });
     });
 }
 
diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp.inc
index 1048f2f8ff8..7500ae9e060 100644
--- a/dpcpp/matrix/batch_ell_kernels.hpp.inc
+++ b/dpcpp/matrix/batch_ell_kernels.hpp.inc
@@ -32,7 +32,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 template <typename ValueType>
 __dpct_inline__ void simple_apply_kernel(
-    const gko::batch::matrix::batch_ell::batch_item<const ValueType>& mat,
+    const gko::batch::matrix::ell::batch_item<const ValueType>& mat,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const gko::batch::multi_vector::batch_item<ValueType>& x,
     sycl::nd_item<3>& item_ct1)
@@ -56,7 +56,7 @@ __dpct_inline__ void simple_apply_kernel(
 template <typename ValueType>
 __dpct_inline__ void advanced_apply_kernel(
     const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
-    const gko::batch::matrix::batch_ell::batch_item<const ValueType>& mat,
+    const gko::batch::matrix::ell::batch_item<const ValueType>& mat,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const gko::batch::multi_vector::batch_item<const ValueType>& beta,
     const gko::batch::multi_vector::batch_item<ValueType>& x,
diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp
index 35ff1148dd5..f857653e05e 100644
--- a/dpcpp/matrix/batch_struct.hpp
+++ b/dpcpp/matrix/batch_struct.hpp
@@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 
 
 #include "core/base/batch_struct.hpp"
@@ -90,16 +91,16 @@ inline batch::matrix::dense::uniform_batch<ValueType> get_batch_struct(
  * Generates an immutable uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType>
-inline batch::matrix::batch_ell::uniform_batch<const ValueType>
-get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
+inline batch::matrix::ell::uniform_batch<const ValueType> get_batch_struct(
+    const batch::matrix::Ell<ValueType, int32>* const op)
 {
     return {op->get_const_values(),
             op->get_const_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[1]),
-            static_cast<int>(op->get_num_stored_elements_per_row())};
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_num_stored_elements_per_row())};
 }
 
 
@@ -107,16 +108,16 @@ get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
  * Generates a uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType>
-inline batch::matrix::batch_ell::uniform_batch<ValueType> get_batch_struct(
+inline batch::matrix::ell::uniform_batch<ValueType> get_batch_struct(
     batch::matrix::Ell<ValueType, int32>* const op)
 {
     return {op->get_values(),
             op->get_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[1]),
-            static_cast<int>(op->get_num_stored_elements_per_row())};
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_num_stored_elements_per_row())};
 }
 
 
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
index fdd52c38f57..5c6d5179a21 100644
--- a/hip/matrix/batch_ell_kernels.hip.cpp
+++ b/hip/matrix/batch_ell_kernels.hip.cpp
@@ -35,18 +35,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <hip/hip_runtime.h>
 #include <thrust/functional.h>
-#include <thrust/transform.h>
 
 
-#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 
 
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "hip/base/batch_struct.hip.hpp"
 #include "hip/base/config.hip.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp
index a43d7d058b0..6f15b2d966a 100644
--- a/hip/matrix/batch_struct.hip.hpp
+++ b/hip/matrix/batch_struct.hip.hpp
@@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 
 
 #include "core/base/batch_struct.hpp"
@@ -91,16 +92,16 @@ get_batch_struct(batch::matrix::Dense<ValueType>* const op)
  * Generates an immutable uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType>
-inline batch::matrix::batch_ell::uniform_batch<const hip_type<ValueType>>
+inline batch::matrix::ell::uniform_batch<const hip_type<ValueType>>
 get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
 {
     return {as_hip_type(op->get_const_values()),
             op->get_const_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[1]),
-            static_cast<int>(op->get_num_stored_elements_per_row())};
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_num_stored_elements_per_row())};
 }
 
 
@@ -108,16 +109,16 @@ get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
  * Generates a uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType>
-inline batch::matrix::batch_ell::uniform_batch<hip_type<ValueType>>
-get_batch_struct(batch::matrix::Ell<ValueType, int32>* const op)
+inline batch::matrix::ell::uniform_batch<hip_type<ValueType>> get_batch_struct(
+    batch::matrix::Ell<ValueType, int32>* const op)
 {
     return {as_hip_type(op->get_values()),
             op->get_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[1]),
-            static_cast<int>(op->get_num_stored_elements_per_row())};
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_num_stored_elements_per_row())};
 }
 
 
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index 5cb5f73dec5..6f3db1bb96b 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -356,14 +356,6 @@ class Ell final
                       col_idxs_.get_num_elems());
     }
 
-    /**
-     * Creates a Ell matrix with the same configuration as the callers
-     * matrix.
-     *
-     * @returns a Ell matrix with the same configuration as the caller.
-     */
-    std::unique_ptr<Ell> create_with_same_config() const;
-
     void apply_impl(const MultiVector<value_type>* b,
                     MultiVector<value_type>* x) const;
 
diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp
index 2d0b7ed4d40..b91a4133dba 100644
--- a/omp/matrix/batch_dense_kernels.cpp
+++ b/omp/matrix/batch_dense_kernels.cpp
@@ -36,8 +36,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <algorithm>
 
 
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
 
 
 #include "core/base/batch_struct.hpp"
diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp
index 20ea4614e7d..17710a97366 100644
--- a/omp/matrix/batch_ell_kernels.cpp
+++ b/omp/matrix/batch_ell_kernels.cpp
@@ -36,8 +36,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <algorithm>
 
 
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 
 
 #include "core/base/batch_struct.hpp"
diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp
index 3d7ef03a3bd..87d73bb8e34 100644
--- a/reference/matrix/batch_dense_kernels.cpp
+++ b/reference/matrix/batch_dense_kernels.cpp
@@ -36,9 +36,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <algorithm>
 
 
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
 
 
 #include "core/base/batch_struct.hpp"
diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp
index a3f69827c02..1d3a0e1ef94 100644
--- a/reference/matrix/batch_ell_kernels.cpp
+++ b/reference/matrix/batch_ell_kernels.cpp
@@ -36,9 +36,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <algorithm>
 
 
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
 
 
 #include "core/base/batch_struct.hpp"
diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp.inc
index 41d0a00ddcd..44de2a57af9 100644
--- a/reference/matrix/batch_ell_kernels.hpp.inc
+++ b/reference/matrix/batch_ell_kernels.hpp.inc
@@ -32,7 +32,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 template <typename ValueType>
 inline void simple_apply_kernel(
-    const gko::batch::matrix::batch_ell::batch_item<const ValueType>& a,
+    const gko::batch::matrix::ell::batch_item<const ValueType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const gko::batch::multi_vector::batch_item<ValueType>& c)
 {
@@ -55,7 +55,7 @@ inline void simple_apply_kernel(
 template <typename ValueType>
 inline void advanced_apply_kernel(
     const ValueType alpha,
-    const gko::batch::matrix::batch_ell::batch_item<const ValueType>& a,
+    const gko::batch::matrix::ell::batch_item<const ValueType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const ValueType beta,
     const gko::batch::multi_vector::batch_item<ValueType>& c)
diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp
index 3b562450ee0..fb0e08c16f5 100644
--- a/reference/matrix/batch_struct.hpp
+++ b/reference/matrix/batch_struct.hpp
@@ -95,16 +95,16 @@ inline batch::matrix::dense::uniform_batch<ValueType> get_batch_struct(
  * Generates an immutable uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType>
-inline batch::matrix::batch_ell::uniform_batch<const ValueType>
-get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
+inline batch::matrix::ell::uniform_batch<const ValueType> get_batch_struct(
+    const batch::matrix::Ell<ValueType, int32>* const op)
 {
     return {op->get_const_values(),
             op->get_const_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[1]),
-            static_cast<int>(op->get_num_stored_elements_per_row())};
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_num_stored_elements_per_row())};
 }
 
 
@@ -112,16 +112,16 @@ get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
  * Generates a uniform batch struct from a batch of ell matrices.
  */
 template <typename ValueType>
-inline batch::matrix::batch_ell::uniform_batch<ValueType> get_batch_struct(
+inline batch::matrix::ell::uniform_batch<ValueType> get_batch_struct(
     batch::matrix::Ell<ValueType, int32>* const op)
 {
     return {op->get_values(),
             op->get_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[0]),
-            static_cast<int>(op->get_common_size()[1]),
-            static_cast<int>(op->get_num_stored_elements_per_row())};
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_num_stored_elements_per_row())};
 }
 
 
diff --git a/reference/test/matrix/batch_ell_kernels.cpp b/reference/test/matrix/batch_ell_kernels.cpp
index 76b681c69f7..8a5806a9513 100644
--- a/reference/test/matrix/batch_ell_kernels.cpp
+++ b/reference/test/matrix/batch_ell_kernels.cpp
@@ -58,15 +58,13 @@ class Ell : public ::testing::Test {
 protected:
     using value_type = T;
     using size_type = gko::size_type;
-    using Mtx = gko::batch::matrix::Ell<value_type>;
-    using MVec = gko::batch::MultiVector<value_type>;
+    using BMtx = gko::batch::matrix::Ell<value_type>;
+    using BMVec = gko::batch::MultiVector<value_type>;
     using EllMtx = gko::matrix::Ell<value_type>;
     using DenseMtx = gko::matrix::Dense<value_type>;
-    using ComplexMtx = gko::to_complex<Mtx>;
-    using RealMtx = gko::remove_complex<Mtx>;
     Ell()
         : exec(gko::ReferenceExecutor::create()),
-          mtx_0(gko::batch::initialize<Mtx>(
+          mtx_0(gko::batch::initialize<BMtx>(
               {{I<T>({1.0, -1.0, 1.5}), I<T>({-2.0, 2.0, 3.0})},
                {{1.0, -2.0, -0.5}, {1.0, -2.5, 4.0}}},
               exec)),
@@ -74,7 +72,7 @@ class Ell : public ::testing::Test {
               {I<T>({1.0, -1.0, 1.5}), I<T>({-2.0, 2.0, 3.0})}, exec)),
           mtx_01(gko::initialize<EllMtx>(
               {I<T>({1.0, -2.0, -0.5}), I<T>({1.0, -2.5, 4.0})}, exec)),
-          b_0(gko::batch::initialize<MVec>(
+          b_0(gko::batch::initialize<BMVec>(
               {{I<T>({1.0, 0.0, 1.0}), I<T>({2.0, 0.0, 1.0}),
                 I<T>({1.0, 0.0, 2.0})},
                {I<T>({-1.0, 1.0, 1.0}), I<T>({1.0, -1.0, 1.0}),
@@ -88,7 +86,7 @@ class Ell : public ::testing::Test {
               {I<T>({-1.0, 1.0, 1.0}), I<T>({1.0, -1.0, 1.0}),
                I<T>({1.0, 0.0, 2.0})},
               exec)),
-          x_0(gko::batch::initialize<MVec>(
+          x_0(gko::batch::initialize<BMVec>(
               {{I<T>({2.0, 0.0, 1.0}), I<T>({2.0, 0.0, 2.0})},
                {I<T>({-2.0, 1.0, 1.0}), I<T>({1.0, -1.0, -1.0})}},
               exec)),
@@ -99,13 +97,13 @@ class Ell : public ::testing::Test {
     {}
 
     std::shared_ptr<const gko::ReferenceExecutor> exec;
-    std::unique_ptr<Mtx> mtx_0;
+    std::unique_ptr<BMtx> mtx_0;
     std::unique_ptr<EllMtx> mtx_00;
     std::unique_ptr<EllMtx> mtx_01;
-    std::unique_ptr<MVec> b_0;
+    std::unique_ptr<BMVec> b_0;
     std::unique_ptr<DenseMtx> b_00;
     std::unique_ptr<DenseMtx> b_01;
-    std::unique_ptr<MVec> x_0;
+    std::unique_ptr<BMVec> x_0;
     std::unique_ptr<DenseMtx> x_00;
     std::unique_ptr<DenseMtx> x_01;
 
@@ -121,38 +119,10 @@ TYPED_TEST(Ell, AppliesToBatchMultiVector)
     using T = typename TestFixture::value_type;
 
     this->mtx_0->apply(this->b_0.get(), this->x_0.get());
+
     this->mtx_00->apply(this->b_00.get(), this->x_00.get());
     this->mtx_01->apply(this->b_01.get(), this->x_01.get());
-
-    auto res = gko::batch::unbatch<gko::batch::MultiVector<T>>(this->x_0.get());
-
-    GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.);
-    GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.);
-}
-
-
-TYPED_TEST(Ell, AppliesLinearCombinationWithSameAlphaToBatchMultiVector)
-{
-    using Mtx = typename TestFixture::Mtx;
-    using MVec = typename TestFixture::MVec;
-    using DenseMtx = typename TestFixture::DenseMtx;
-    using T = typename TestFixture::value_type;
-    auto alpha = gko::batch::initialize<MVec>(2, {1.5}, this->exec);
-    auto beta = gko::batch::initialize<MVec>(2, {-4.0}, this->exec);
-    auto alpha0 = gko::initialize<DenseMtx>({1.5}, this->exec);
-    auto alpha1 = gko::initialize<DenseMtx>({1.5}, this->exec);
-    auto beta0 = gko::initialize<DenseMtx>({-4.0}, this->exec);
-    auto beta1 = gko::initialize<DenseMtx>({-4.0}, this->exec);
-
-    this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(),
-                       this->x_0.get());
-    this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(),
-                        this->x_00.get());
-    this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(),
-                        this->x_01.get());
-
     auto res = gko::batch::unbatch<gko::batch::MultiVector<T>>(this->x_0.get());
-
     GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.);
     GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.);
 }
@@ -160,12 +130,12 @@ TYPED_TEST(Ell, AppliesLinearCombinationWithSameAlphaToBatchMultiVector)
 
 TYPED_TEST(Ell, AppliesLinearCombinationToBatchMultiVector)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using MVec = typename TestFixture::MVec;
+    using BMtx = typename TestFixture::BMtx;
+    using BMVec = typename TestFixture::BMVec;
     using DenseMtx = typename TestFixture::DenseMtx;
     using T = typename TestFixture::value_type;
-    auto alpha = gko::batch::initialize<MVec>({{1.5}, {-1.0}}, this->exec);
-    auto beta = gko::batch::initialize<MVec>({{2.5}, {-4.0}}, this->exec);
+    auto alpha = gko::batch::initialize<BMVec>({{1.5}, {-1.0}}, this->exec);
+    auto beta = gko::batch::initialize<BMVec>({{2.5}, {-4.0}}, this->exec);
     auto alpha0 = gko::initialize<DenseMtx>({1.5}, this->exec);
     auto alpha1 = gko::initialize<DenseMtx>({-1.0}, this->exec);
     auto beta0 = gko::initialize<DenseMtx>({2.5}, this->exec);
@@ -173,13 +143,12 @@ TYPED_TEST(Ell, AppliesLinearCombinationToBatchMultiVector)
 
     this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(),
                        this->x_0.get());
+
     this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(),
                         this->x_00.get());
     this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(),
                         this->x_01.get());
-
     auto res = gko::batch::unbatch<gko::batch::MultiVector<T>>(this->x_0.get());
-
     GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.);
     GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.);
 }
@@ -187,8 +156,8 @@ TYPED_TEST(Ell, AppliesLinearCombinationToBatchMultiVector)
 
 TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultCols)
 {
-    using MVec = typename TestFixture::MVec;
-    auto res = MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2}});
+    using BMVec = typename TestFixture::BMVec;
+    auto res = BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2}});
 
     ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()),
                  gko::DimensionMismatch);
@@ -197,8 +166,8 @@ TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultCols)
 
 TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultRows)
 {
-    using MVec = typename TestFixture::MVec;
-    auto res = MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3}});
+    using BMVec = typename TestFixture::BMVec;
+    auto res = BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3}});
 
     ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()),
                  gko::DimensionMismatch);
@@ -207,9 +176,9 @@ TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultRows)
 
 TYPED_TEST(Ell, ApplyFailsOnWrongInnerDimension)
 {
-    using MVec = typename TestFixture::MVec;
+    using BMVec = typename TestFixture::BMVec;
     auto res =
-        MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}});
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}});
 
     ASSERT_THROW(this->mtx_0->apply(res.get(), this->x_0.get()),
                  gko::DimensionMismatch);
@@ -218,13 +187,13 @@ TYPED_TEST(Ell, ApplyFailsOnWrongInnerDimension)
 
 TYPED_TEST(Ell, AdvancedApplyFailsOnWrongInnerDimension)
 {
-    using MVec = typename TestFixture::MVec;
+    using BMVec = typename TestFixture::BMVec;
     auto res =
-        MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}});
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}});
     auto alpha =
-        MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}});
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}});
     auto beta =
-        MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}});
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}});
 
     ASSERT_THROW(
         this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()),
@@ -234,13 +203,13 @@ TYPED_TEST(Ell, AdvancedApplyFailsOnWrongInnerDimension)
 
 TYPED_TEST(Ell, AdvancedApplyFailsOnWrongAlphaDimension)
 {
-    using MVec = typename TestFixture::MVec;
+    using BMVec = typename TestFixture::BMVec;
     auto res =
-        MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3, 3}});
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3, 3}});
     auto alpha =
-        MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 1}});
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 1}});
     auto beta =
-        MVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}});
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}});
 
     ASSERT_THROW(
         this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()),
diff --git a/test/matrix/batch_ell_kernels.cpp b/test/matrix/batch_ell_kernels.cpp
index bc1e0c7fb42..083af0a0938 100644
--- a/test/matrix/batch_ell_kernels.cpp
+++ b/test/matrix/batch_ell_kernels.cpp
@@ -55,18 +55,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 class Ell : public CommonTestFixture {
 protected:
-    using Mtx = gko::batch::matrix::Ell<value_type>;
-    using MVec = gko::batch::MultiVector<value_type>;
+    using BMtx = gko::batch::matrix::Ell<value_type>;
+    using BMVec = gko::batch::MultiVector<value_type>;
 
     Ell() : rand_engine(15) {}
 
-    template <typename MtxType>
-    std::unique_ptr<MtxType> gen_mtx(const gko::size_type num_batch_items,
-                                     gko::size_type num_rows,
-                                     gko::size_type num_cols,
-                                     int num_elems_per_row)
+    template <typename BMtxType>
+    std::unique_ptr<BMtxType> gen_mtx(const gko::size_type num_batch_items,
+                                      gko::size_type num_rows,
+                                      gko::size_type num_cols,
+                                      int num_elems_per_row)
     {
-        return gko::test::generate_random_batch_matrix<MtxType>(
+        return gko::test::generate_random_batch_matrix<BMtxType>(
             num_batch_items, num_rows, num_cols,
             std::uniform_int_distribution<>(num_elems_per_row,
                                             num_elems_per_row),
@@ -74,11 +74,11 @@ class Ell : public CommonTestFixture {
             num_elems_per_row);
     }
 
-    std::unique_ptr<MVec> gen_mvec(const gko::size_type num_batch_items,
-                                   gko::size_type num_rows,
-                                   gko::size_type num_cols)
+    std::unique_ptr<BMVec> gen_mvec(const gko::size_type num_batch_items,
+                                    gko::size_type num_rows,
+                                    gko::size_type num_cols)
     {
-        return gko::test::generate_random_batch_matrix<MVec>(
+        return gko::test::generate_random_batch_matrix<BMVec>(
             num_batch_items, num_rows, num_cols,
             std::uniform_int_distribution<>(num_cols, num_cols),
             std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
@@ -89,15 +89,16 @@ class Ell : public CommonTestFixture {
     {
         const int num_rows = 252;
         const int num_cols = 32;
-        x = gen_mtx<Mtx>(batch_size, num_rows, num_cols, num_elems_per_row);
+        GKO_ASSERT(num_elems_per_row <= num_cols);
+        mat = gen_mtx<BMtx>(batch_size, num_rows, num_cols, num_elems_per_row);
         y = gen_mvec(batch_size, num_cols, num_vecs);
         alpha = gen_mvec(batch_size, 1, 1);
         beta = gen_mvec(batch_size, 1, 1);
-        dx = gko::clone(exec, x);
+        dmat = gko::clone(exec, mat);
         dy = gko::clone(exec, y);
         dalpha = gko::clone(exec, alpha);
         dbeta = gko::clone(exec, beta);
-        expected = MVec::create(
+        expected = BMVec::create(
             ref,
             gko::batch_dim<2>(batch_size, gko::dim<2>{num_rows, num_vecs}));
         expected->fill(gko::one<value_type>());
@@ -107,16 +108,16 @@ class Ell : public CommonTestFixture {
     std::ranlux48 rand_engine;
 
     const size_t batch_size = 11;
-    std::unique_ptr<Mtx> x;
-    std::unique_ptr<MVec> y;
-    std::unique_ptr<MVec> alpha;
-    std::unique_ptr<MVec> beta;
-    std::unique_ptr<MVec> expected;
-    std::unique_ptr<MVec> dresult;
-    std::unique_ptr<Mtx> dx;
-    std::unique_ptr<MVec> dy;
-    std::unique_ptr<MVec> dalpha;
-    std::unique_ptr<MVec> dbeta;
+    std::unique_ptr<BMtx> mat;
+    std::unique_ptr<BMVec> y;
+    std::unique_ptr<BMVec> alpha;
+    std::unique_ptr<BMVec> beta;
+    std::unique_ptr<BMVec> expected;
+    std::unique_ptr<BMVec> dresult;
+    std::unique_ptr<BMtx> dmat;
+    std::unique_ptr<BMVec> dy;
+    std::unique_ptr<BMVec> dalpha;
+    std::unique_ptr<BMVec> dbeta;
 };
 
 
@@ -124,8 +125,8 @@ TEST_F(Ell, SingleVectorApplyIsEquivalentToRef)
 {
     set_up_apply_data(1);
 
-    x->apply(y.get(), expected.get());
-    dx->apply(dy.get(), dresult.get());
+    mat->apply(y.get(), expected.get());
+    dmat->apply(dy.get(), dresult.get());
 
     GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r<value_type>::value);
 }
@@ -135,8 +136,8 @@ TEST_F(Ell, SingleVectorAdvancedApplyIsEquivalentToRef)
 {
     set_up_apply_data(1);
 
-    x->apply(alpha.get(), y.get(), beta.get(), expected.get());
-    dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+    mat->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmat->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
 
     GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r<value_type>::value);
 }
diff --git a/test/test_install/test_install.cpp b/test/test_install/test_install.cpp
index 7e53ea8f165..c00bb594ecd 100644
--- a/test/test_install/test_install.cpp
+++ b/test/test_install/test_install.cpp
@@ -219,13 +219,20 @@ int main()
         auto test = batch_multi_vector_type::create(exec);
     }
 
-    // core/base/batch_dense.hpp
+    // core/matrix/batch_dense.hpp
     {
         using type1 = float;
         using batch_dense_type = gko::batch::matrix::Dense<type1>;
         auto test = batch_dense_type::create(exec);
     }
 
+    // core/matrix/batch_ell.hpp
+    {
+        using type1 = float;
+        using batch_ell_type = gko::batch::matrix::Ell<type1>;
+        auto test = batch_ell_type::create(exec);
+    }
+
     // core/base/combination.hpp
     {
         using type1 = int;

From b78c0cc422aaff4850e1b35a1ef1e18ae405f04b Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Wed, 11 Oct 2023 14:07:13 +0200
Subject: [PATCH 10/18] Kernel updates and batch_random_matrix gen

---
 .../cuda_hip/matrix/batch_ell_kernels.hpp.inc |  4 +-
 core/matrix/batch_ell.cpp                     |  7 --
 core/test/utils/batch_helpers.hpp             | 17 +++-
 core/test/utils/matrix_generator.hpp          | 90 +++++++++++++++++++
 cuda/matrix/batch_ell_kernels.cu              |  1 +
 dpcpp/matrix/batch_ell_kernels.hpp.inc        | 57 ++++++------
 hip/matrix/batch_ell_kernels.hip.cpp          |  1 +
 include/ginkgo/core/matrix/batch_ell.hpp      | 19 ++--
 test/matrix/batch_ell_kernels.cpp             |  2 +-
 9 files changed, 149 insertions(+), 49 deletions(-)

diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
index 5c00358c5a0..19c29f14aa8 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
@@ -46,7 +46,7 @@ __device__ __forceinline__ void simple_apply(
         for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
             const auto ind = tidx + idx * stride;
             const auto col_idx = col[ind];
-            if (col_idx < idx) {
+            if (col_idx == invalid_index<int>()) {
                 break;
             } else {
                 temp += val[ind] * b[col_idx];
@@ -102,7 +102,7 @@ __device__ __forceinline__ void advanced_apply(
         for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
             const auto ind = tidx + idx * stride;
             const auto col_idx = col[ind];
-            if (col_idx < idx) {
+            if (col_idx == invalid_index<int>()) {
                 break;
             } else {
                 temp += alpha * val[ind] * b[col_idx];
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
index f421fdf2b49..c9dbe6d51c9 100644
--- a/core/matrix/batch_ell.cpp
+++ b/core/matrix/batch_ell.cpp
@@ -128,13 +128,6 @@ Ell<ValueType, IndexType>::create_const(
 }
 
 
-inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes)
-{
-    return batch_dim<2>(sizes.get_num_batch_items(),
-                        dim<2>(1, sizes.get_common_size()[1]));
-}
-
-
 template <typename ValueType, typename IndexType>
 Ell<ValueType, IndexType>::Ell(std::shared_ptr<const Executor> exec,
                                const batch_dim<2>& size,
diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp
index b040691999e..0b6197b5062 100644
--- a/core/test/utils/batch_helpers.hpp
+++ b/core/test/utils/batch_helpers.hpp
@@ -82,11 +82,22 @@ std::unique_ptr<MatrixType> generate_random_batch_matrix(
     auto result = MatrixType::create(
         exec, batch_dim<2>(num_batch_items, dim<2>(num_rows, num_cols)),
         std::forward<MatrixArgs>(args)...);
+    auto sp_mat = generate_random_device_matrix_data<value_type, index_type>(
+        num_rows, num_cols, nonzero_dist, value_dist, engine,
+        exec->get_master());
+    auto row_idxs = gko::array<index_type>::const_view(
+                        exec->get_master(), sp_mat.get_num_elems(),
+                        sp_mat.get_const_row_idxs())
+                        .copy_to_array();
+    auto col_idxs = gko::array<index_type>::const_view(
+                        exec->get_master(), sp_mat.get_num_elems(),
+                        sp_mat.get_const_col_idxs())
+                        .copy_to_array();
 
     for (size_type b = 0; b < num_batch_items; b++) {
-        auto rand_mat =
-            generate_random_matrix<typename MatrixType::unbatch_type>(
-                num_rows, num_cols, nonzero_dist, value_dist, engine, exec);
+        auto rand_mat = fill_random_matrix_with_sparsity_pattern<
+            typename MatrixType::unbatch_type, index_type>(
+            num_rows, num_cols, row_idxs, col_idxs, value_dist, engine, exec);
         result->create_view_for_item(b)->copy_from(rand_mat.get());
     }
 
diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp
index 6928c5424a5..8a82ae744e7 100644
--- a/core/test/utils/matrix_generator.hpp
+++ b/core/test/utils/matrix_generator.hpp
@@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <vector>
 
 
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -54,6 +55,49 @@ namespace gko {
 namespace test {
 
 
+/**
+ * Fills matrix data for a random matrix given a sparsity pattern
+ *
+ * @tparam ValueType  the type for matrix values
+ * @tparam IndexType  the type for row and column indices
+ * @tparam ValueDistribution  type of value distribution
+ * @tparam Engine  type of random engine
+ *
+ * @param num_rows  number of rows
+ * @param num_cols  number of columns
+ * @param row_indices  the row indices of the matrix
+ * @param col_indices  the column indices of the matrix
+ * @param value_dist  distribution of matrix values
+ * @param engine  a random engine
+ *
+ * @return the generated matrix_data with entries according to the given
+ *         dimensions, sparsity pattern and value distribution.
+ */
+template <typename ValueType, typename IndexType, typename ValueDistribution,
+          typename Engine>
+matrix_data<ValueType, IndexType> fill_random_matrix_data(
+    size_type num_rows, size_type num_cols,
+    const gko::array<IndexType>& row_indices,
+    const gko::array<IndexType>& col_indices, ValueDistribution&& value_dist,
+    Engine&& engine)
+{
+    matrix_data<ValueType, IndexType> data{gko::dim<2>{num_rows, num_cols}, {}};
+    auto host_exec = row_indices.get_executor()->get_master();
+    auto host_row_indices = make_temporary_clone(host_exec, &row_indices);
+    auto host_col_indices = make_temporary_clone(host_exec, &col_indices);
+
+    for (int nnz = 0; nnz < row_indices.get_num_elems(); ++nnz) {
+        data.nonzeros.emplace_back(
+            host_row_indices->get_const_data()[nnz],
+            host_col_indices->get_const_data()[nnz],
+            detail::get_rand_value<ValueType>(value_dist, engine));
+    }
+
+    data.ensure_row_major_order();
+    return data;
+}
+
+
 /**
  * Generates matrix data for a random matrix.
  *
@@ -156,6 +200,48 @@ generate_random_device_matrix_data(gko::size_type num_rows,
 }
 
 
+/**
+ * Fills a random matrix with given sparsity pattern.
+ *
+ * @tparam MatrixType  type of matrix to generate (must implement
+ *                     the interface `ReadableFromMatrixData<>` and provide
+ *                     matching `value_type` and `index_type` type aliases)
+ *
+ * @param num_rows  number of rows
+ * @param num_cols  number of columns
+ * @param row_idxs  the row indices of the matrix
+ * @param col_idxs  the column indices of the matrix
+ * @param value_dist  distribution of matrix values
+ * @param engine  a random engine
+ * @param exec  executor where the matrix should be allocated
+ * @param args  additional arguments for the matrix constructor
+ *
+ * The other (template) parameters match generate_random_matrix_data.
+ * @return the unique pointer of MatrixType
+ */
+template <typename MatrixType = matrix::Dense<>,
+          typename IndexType = typename MatrixType::index_type,
+          typename ValueDistribution, typename Engine, typename... MatrixArgs>
+std::unique_ptr<MatrixType> fill_random_matrix_with_sparsity_pattern(
+    size_type num_rows, size_type num_cols,
+    const gko::array<IndexType>& row_idxs,
+    const gko::array<IndexType>& col_idxs, ValueDistribution&& value_dist,
+    Engine&& engine, std::shared_ptr<const Executor> exec, MatrixArgs&&... args)
+{
+    using value_type = typename MatrixType::value_type;
+    using index_type = IndexType;
+
+    GKO_ASSERT(row_idxs.get_num_elems() == col_idxs.get_num_elems());
+    GKO_ASSERT(row_idxs.get_num_elems() < (num_rows * num_cols));
+    auto result = MatrixType::create(exec, std::forward<MatrixArgs>(args)...);
+    result->read(fill_random_matrix_data<value_type, index_type>(
+        num_rows, num_cols, row_idxs, col_idxs,
+        std::forward<ValueDistribution>(value_dist),
+        std::forward<Engine>(engine)));
+    return result;
+}
+
+
 /**
  * Generates a random matrix.
  *
@@ -163,6 +249,10 @@ generate_random_device_matrix_data(gko::size_type num_rows,
  *                     the interface `ReadableFromMatrixData<>` and provide
  *                     matching `value_type` and `index_type` type aliases)
  *
+ * @param num_rows  number of rows
+ * @param num_cols  number of columns
+ * @param nonzero_dist  distribution of nonzeros per row
+ * @param value_dist  distribution of matrix values
  * @param exec  executor where the matrix should be allocated
  * @param args  additional arguments for the matrix constructor
  *
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
index 6dd268a2d8e..5cadd7755a2 100644
--- a/cuda/matrix/batch_ell_kernels.cu
+++ b/cuda/matrix/batch_ell_kernels.cu
@@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
 
diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp.inc
index 7500ae9e060..e6501bafaba 100644
--- a/dpcpp/matrix/batch_ell_kernels.hpp.inc
+++ b/dpcpp/matrix/batch_ell_kernels.hpp.inc
@@ -42,38 +42,37 @@ __dpct_inline__ void simple_apply_kernel(
         auto temp = zero<ValueType>();
         for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) {
             const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
-            if (col_idx < idx)
+            if (col_idx == invalid_index<int>()) {
                 break;
-            else
-                temp += mat.values[tidx + idx * mat.stride] *
-                        b.values[col_idx * b.stride];
+                else temp += mat.values[tidx + idx * mat.stride] *
+                             b.values[col_idx * b.stride];
+            }
+            x.values[tidx * x.stride] = temp;
         }
-        x.values[tidx * x.stride] = temp;
     }
-}
 
 
-template <typename ValueType>
-__dpct_inline__ void advanced_apply_kernel(
-    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
-    const gko::batch::matrix::ell::batch_item<const ValueType>& mat,
-    const gko::batch::multi_vector::batch_item<const ValueType>& b,
-    const gko::batch::multi_vector::batch_item<const ValueType>& beta,
-    const gko::batch::multi_vector::batch_item<ValueType>& x,
-    sycl::nd_item<3>& item_ct1)
-{
-    for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows;
-         tidx += item_ct1.get_local_range().size()) {
-        auto temp = zero<ValueType>();
-        for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) {
-            const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
-            if (col_idx < idx)
-                break;
-            else
-                temp += alpha.values[0] * mat.values[tidx + idx * mat.stride] *
-                        b.values[col_idx * b.stride];
+    template <typename ValueType>
+    __dpct_inline__ void advanced_apply_kernel(
+        const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+        const gko::batch::matrix::ell::batch_item<const ValueType>& mat,
+        const gko::batch::multi_vector::batch_item<const ValueType>& b,
+        const gko::batch::multi_vector::batch_item<const ValueType>& beta,
+        const gko::batch::multi_vector::batch_item<ValueType>& x,
+        sycl::nd_item<3>& item_ct1)
+    {
+        for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows;
+             tidx += item_ct1.get_local_range().size()) {
+            auto temp = zero<ValueType>();
+            for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) {
+                const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
+                if (col_idx == invalid_index<int>()) {
+                    break;
+                    else temp += alpha.values[0] *
+                                 mat.values[tidx + idx * mat.stride] *
+                                 b.values[col_idx * b.stride];
+                }
+                x.values[tidx * x.stride] =
+                    temp + beta.values[0] * x.values[tidx * x.stride];
+            }
         }
-        x.values[tidx * x.stride] =
-            temp + beta.values[0] * x.values[tidx * x.stride];
-    }
-}
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
index 5c6d5179a21..96e7cdb298e 100644
--- a/hip/matrix/batch_ell_kernels.hip.cpp
+++ b/hip/matrix/batch_ell_kernels.hip.cpp
@@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
 
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index 6f3db1bb96b..be49e2cff41 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -58,9 +58,14 @@ namespace matrix {
  * Ell is a sparse matrix format that stores the same number of nonzeros in each
  * row, enabling coalesced accesses. It is suitable for sparsity patterns that
  * have a similar number of nonzeros in every row. The values are stored in a
- * column-major fashion similar to the monolithic gko::matrix::Ell class. It is
- * also assumed that the sparsity pattern of all the items in the batch is the
- * same and therefore only a single copy of the sparsity pattern is stored.
+ * column-major fashion similar to the monolithic gko::matrix::Ell class.
+ *
+ * Similar to the monolithic gko::matrix::Ell class, invalid_index<IndexType> is
+ * used as the column index for padded zero entries.
+ *
+ * @note It is also assumed that the sparsity pattern of all the items in the
+ * batch is the same and therefore only a single copy of the sparsity pattern is
+ * stored.
  *
  * @tparam ValueType  value precision of matrix elements
  * @tparam IndexType  index precision of matrix elements
@@ -253,13 +258,13 @@ class Ell final
 
     /**
      * Creates a constant (immutable) batch ell matrix from a constant
-     * array.
+     * array. The column indices array needs to be the same for all batch items.
      *
      * @param exec  the executor to create the matrix on
      * @param size  the dimensions of the matrix
      * @param num_elems_per_row  the number of elements to be stored in each row
      * @param values  the value array of the matrix
-     * @param col_idxs the col_idxs array of the matrix
+     * @param col_idxs the col_idxs array of a single batch item of the matrix.
      *
      * @return A smart pointer to the constant matrix wrapping the input
      * array (if it resides on the same executor as the matrix) or a copy of the
@@ -325,7 +330,7 @@ class Ell final
 
     /**
      * Creates a Ell matrix from an already allocated (and initialized)
-     * array.
+     * array. The column indices array needs to be the same for all batch items.
      *
      * @tparam ValuesArray  type of array of values
      *
@@ -333,7 +338,7 @@ class Ell final
      * @param size  size of the matrix
      * @param num_elems_per_row  the number of elements to be stored in each row
      * @param values  array of matrix values
-     * @param col_idxs the col_idxs array of the matrix
+     * @param col_idxs the col_idxs array of a single batch item of the matrix.
      *
      * @note If `values` is not an rvalue, not an array of ValueType, or is on
      *       the wrong executor, an internal copy will be created, and the
diff --git a/test/matrix/batch_ell_kernels.cpp b/test/matrix/batch_ell_kernels.cpp
index 083af0a0938..572f47ba47d 100644
--- a/test/matrix/batch_ell_kernels.cpp
+++ b/test/matrix/batch_ell_kernels.cpp
@@ -55,7 +55,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 class Ell : public CommonTestFixture {
 protected:
-    using BMtx = gko::batch::matrix::Ell<value_type>;
+    using BMtx = gko::batch::matrix::Ell<value_type, gko::int32>;
     using BMVec = gko::batch::MultiVector<value_type>;
 
     Ell() : rand_engine(15) {}

From 4179654e60313541032435ae52557785787a8001 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Wed, 11 Oct 2023 15:41:55 +0200
Subject: [PATCH 11/18] Review updates

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
Co-authored-by: Yu-Hsiang Tsai <yhmtsai@gmail.com>
---
 .../cuda_hip/matrix/batch_ell_kernels.hpp.inc |  22 +--
 core/base/batch_multi_vector.cpp              |  21 ---
 core/base/batch_utilities.hpp                 |  47 ++---
 core/matrix/batch_struct.hpp                  |  40 ++---
 core/test/matrix/batch_ell.cpp                | 160 ++++++++----------
 core/test/utils/matrix_generator.hpp          |   2 +-
 cuda/matrix/batch_struct.hpp                  |  28 +--
 dpcpp/matrix/batch_ell_kernels.dp.cpp         |   2 +
 dpcpp/matrix/batch_ell_kernels.hpp.inc        |  61 +++----
 dpcpp/matrix/batch_struct.hpp                 |  28 +--
 hip/matrix/batch_struct.hip.hpp               |  28 +--
 .../ginkgo/core/base/batch_multi_vector.hpp   |  18 +-
 include/ginkgo/core/matrix/batch_dense.hpp    |   2 -
 include/ginkgo/core/matrix/batch_ell.hpp      |   7 +-
 reference/matrix/batch_ell_kernels.hpp.inc    |  24 +--
 reference/matrix/batch_struct.hpp             |  28 +--
 reference/test/matrix/batch_ell_kernels.cpp   |   8 +-
 17 files changed, 235 insertions(+), 291 deletions(-)

diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
index 19c29f14aa8..de6ca879890 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
@@ -31,9 +31,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
 
-template <typename ValueType>
+template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
-    const gko::batch::matrix::ell::batch_item<const ValueType>& mat,
+    const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
     const ValueType* const __restrict__ b, ValueType* const __restrict__ x)
 {
     const auto num_rows = mat.num_rows;
@@ -46,7 +46,7 @@ __device__ __forceinline__ void simple_apply(
         for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
             const auto ind = tidx + idx * stride;
             const auto col_idx = col[ind];
-            if (col_idx == invalid_index<int>()) {
+            if (col_idx == invalid_index<IndexType>()) {
                 break;
             } else {
                 temp += val[ind] * b[col_idx];
@@ -56,12 +56,13 @@ __device__ __forceinline__ void simple_apply(
     }
 }
 
-template <typename ValueType>
+template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(
     default_block_size,
     sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
                                                       ell::uniform_batch<
-                                                          const ValueType>
+                                                          const ValueType,
+                                                          IndexType>
                                                           mat,
                                                   const gko::batch::
                                                       multi_vector::
@@ -85,10 +86,10 @@ __global__ __launch_bounds__(
 }
 
 
-template <typename ValueType>
+template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void advanced_apply(
     const ValueType alpha,
-    const gko::batch::matrix::ell::batch_item<const ValueType>& mat,
+    const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
     const ValueType* const __restrict__ b, const ValueType beta,
     ValueType* const __restrict__ x)
 {
@@ -102,7 +103,7 @@ __device__ __forceinline__ void advanced_apply(
         for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
             const auto ind = tidx + idx * stride;
             const auto col_idx = col[ind];
-            if (col_idx == invalid_index<int>()) {
+            if (col_idx == invalid_index<IndexType>()) {
                 break;
             } else {
                 temp += alpha * val[ind] * b[col_idx];
@@ -112,7 +113,7 @@ __device__ __forceinline__ void advanced_apply(
     }
 }
 
-template <typename ValueType>
+template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(
     default_block_size,
     sm_oversubscription) void advanced_apply_kernel(const gko::batch::
@@ -122,7 +123,8 @@ __global__ __launch_bounds__(
                                                                 alpha,
                                                     const gko::batch::matrix::
                                                         ell::uniform_batch<
-                                                            const ValueType>
+                                                            const ValueType,
+                                                            IndexType>
                                                             mat,
                                                     const gko::batch::
                                                         multi_vector::
diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp
index 6a14919bf2f..6dcf8dd90b5 100644
--- a/core/base/batch_multi_vector.cpp
+++ b/core/base/batch_multi_vector.cpp
@@ -291,27 +291,6 @@ void MultiVector<ValueType>::move_to(
 }
 
 
-template <typename ValueType>
-void MultiVector<ValueType>::convert_to(matrix::Dense<ValueType>* result) const
-{
-    auto exec = result->get_executor() == nullptr ? this->get_executor()
-                                                  : result->get_executor();
-    auto tmp = gko::batch::matrix::Dense<ValueType>::create_const(
-        exec, this->get_size(),
-        make_const_array_view(this->get_executor(),
-                              this->get_num_stored_elements(),
-                              this->get_const_values()));
-    result->copy_from(tmp);
-}
-
-
-template <typename ValueType>
-void MultiVector<ValueType>::move_to(matrix::Dense<ValueType>* result)
-{
-    this->convert_to(result);
-}
-
-
 #define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class MultiVector<_type>
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR);
 
diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp
index c37c0cae721..7204c78a552 100644
--- a/core/base/batch_utilities.hpp
+++ b/core/base/batch_utilities.hpp
@@ -165,12 +165,8 @@ std::vector<gko::matrix_data<ValueType, IndexType>> write(
 /**
  * Creates and initializes a batch of single column-vectors.
  *
- * This function first creates a temporary MultiVector, fills it with
- * passed in values, and then converts the vector to the requested type.
- *
- * @tparam Matrix  matrix type to initialize
- *                 (MultiVector has to implement the ConvertibleTo<Matrix>
- *                 interface)
+ * @tparam Matrix  matrix type to initialize (It has to implement the
+ *                 read<Matrix> function)
  * @tparam TArgs  argument types for Matrix::create method
  *                (not including the implied Executor as the first argument)
  *
@@ -180,7 +176,6 @@ std::vector<gko::matrix_data<ValueType, IndexType>> write(
  *                     including the Executor, which is passed as the first
  *                     argument
  *
- * @ingroup MultiVector
  * @ingroup mat_formats
  */
 template <typename Matrix, typename... TArgs>
@@ -220,23 +215,19 @@ std::unique_ptr<Matrix> initialize(
 
 
 /**
- * Creates and initializes a batch of multi-vectors.
- *
- * This function first creates a temporary MultiVector, fills it with
- * passed in values, and then converts the vector to the requested type.
+ * Creates and initializes a batch of matrices.
  *
- * @tparam Matrix  matrix type to initialize
- *                 (Dense has to implement the ConvertibleTo<Matrix> interface)
+ * @tparam Matrix  matrix type to initialize (It has to implement the
+ *                 read<Matrix> function)
  * @tparam TArgs  argument types for Matrix::create method
  *                (not including the implied Executor as the first argument)
  *
- * @param vals  values used to initialize the vector
- * @param exec  Executor associated to the vector
+ * @param vals  values used to initialize the matrix
+ * @param exec  Executor associated with the matrix
  * @param create_args  additional arguments passed to Matrix::create, not
  *                     including the Executor, which is passed as the first
  *                     argument
  *
- * @ingroup MultiVector
  * @ingroup mat_formats
  */
 template <typename Matrix, typename... TArgs>
@@ -290,23 +281,18 @@ std::unique_ptr<Matrix> initialize(
  * Creates and initializes a batch single column-vector by making copies of the
  * single input column vector.
  *
- * This function first creates a temporary batch multi-vector, fills it with
- * passed in values, and then converts the vector to the requested type.
- *
- * @tparam Matrix  matrix type to initialize
- *                 (MultiVector has to implement the ConvertibleTo<Matrix>
- *                  interface)
+ * @tparam Matrix  matrix type to initialize (It has to implement the
+ *                 read<Matrix> function)
  * @tparam TArgs  argument types for Matrix::create method
  *                (not including the implied Executor as the first argument)
  *
  * @param num_vectors  The number of times the input vector is to be duplicated
  * @param vals  values used to initialize each vector in the temp. batch
- * @param exec  Executor associated to the vector
+ * @param exec  Executor associated with the matrix
  * @param create_args  additional arguments passed to Matrix::create, not
  *                     including the Executor, which is passed as the first
  *                     argument
  *
- * @ingroup MultiVector
  * @ingroup mat_formats
  */
 template <typename Matrix, typename... TArgs>
@@ -343,23 +329,18 @@ std::unique_ptr<Matrix> initialize(
 /**
  * Creates and initializes a matrix from copies of a given matrix.
  *
- * This function first creates a temporary batch multi-vector, fills it with
- * passed in values, and then converts the vector to the requested type.
- *
- * @tparam Matrix  matrix type to initialize
- *                 (MultiVector has to implement the ConvertibleTo<Matrix>
- *                  interface)
+ * @tparam Matrix  matrix type to initialize (It has to implement the
+ *                 read<Matrix> function)
  * @tparam TArgs  argument types for Matrix::create method
  *                (not including the implied Executor as the first argument)
  *
  * @param num_batch_items The number of times the input matrix is duplicated
- * @param vals  values used to initialize each vector in the temp. batch
- * @param exec  Executor associated to the vector
+ * @param vals  values used to initialize each matrix in the batch
+ * @param exec  Executor associated with the matrix
  * @param create_args  additional arguments passed to Matrix::create, not
  *                     including the Executor, which is passed as the first
  *                     argument
  *
- * @ingroup LinOp
  * @ingroup mat_formats
  */
 template <typename Matrix, typename... TArgs>
diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp
index eeeeebd53d6..f208f5ff078 100644
--- a/core/matrix/batch_struct.hpp
+++ b/core/matrix/batch_struct.hpp
@@ -89,10 +89,10 @@ namespace ell {
 /**
  * Encapsulates one matrix from a batch of ell matrices.
  */
-template <typename ValueType>
+template <typename ValueType, typename IndexType>
 struct batch_item {
     using value_type = ValueType;
-    using index_type = int32;
+    using index_type = IndexType;
 
     ValueType* values;
     const index_type* col_idxs;
@@ -106,11 +106,11 @@ struct batch_item {
 /**
  * A 'simple' structure to store a global uniform batch of ell matrices.
  */
-template <typename ValueType>
+template <typename ValueType, typename IndexType>
 struct uniform_batch {
     using value_type = ValueType;
-    using index_type = int32;
-    using entry_type = batch_item<value_type>;
+    using index_type = IndexType;
+    using entry_type = batch_item<value_type, index_type>;
 
     ValueType* values;
     const index_type* col_idxs;
@@ -164,27 +164,28 @@ GKO_ATTRIBUTES GKO_INLINE dense::batch_item<ValueType> extract_batch_item(
 }
 
 
-template <typename ValueType>
-GKO_ATTRIBUTES GKO_INLINE ell::batch_item<const ValueType> to_const(
-    const ell::batch_item<ValueType>& b)
+template <typename ValueType, typename IndexType>
+GKO_ATTRIBUTES GKO_INLINE ell::batch_item<const ValueType, IndexType> to_const(
+    const ell::batch_item<ValueType, IndexType>& b)
 {
     return {b.values,   b.col_idxs, b.stride,
             b.num_rows, b.num_cols, b.num_stored_elems_per_row};
 }
 
 
-template <typename ValueType>
-GKO_ATTRIBUTES GKO_INLINE ell::uniform_batch<const ValueType> to_const(
-    const ell::uniform_batch<ValueType>& ub)
+template <typename ValueType, typename IndexType>
+GKO_ATTRIBUTES GKO_INLINE ell::uniform_batch<const ValueType, IndexType>
+to_const(const ell::uniform_batch<ValueType, IndexType>& ub)
 {
     return {ub.values,   ub.col_idxs, ub.num_batch_items,         ub.stride,
             ub.num_rows, ub.num_cols, ub.num_stored_elems_per_row};
 }
 
 
-template <typename ValueType>
-GKO_ATTRIBUTES GKO_INLINE ell::batch_item<ValueType> extract_batch_item(
-    const ell::uniform_batch<ValueType>& batch, const size_type batch_idx)
+template <typename ValueType, typename IndexType>
+GKO_ATTRIBUTES GKO_INLINE ell::batch_item<ValueType, IndexType>
+extract_batch_item(const ell::uniform_batch<ValueType, IndexType>& batch,
+                   const size_type batch_idx)
 {
     return {batch.values +
                 batch_idx * batch.num_stored_elems_per_row * batch.num_rows,
@@ -195,11 +196,12 @@ GKO_ATTRIBUTES GKO_INLINE ell::batch_item<ValueType> extract_batch_item(
             batch.num_stored_elems_per_row};
 }
 
-template <typename ValueType>
-GKO_ATTRIBUTES GKO_INLINE ell::batch_item<ValueType> extract_batch_item(
-    ValueType* const batch_values, int* const batch_col_idxs, const int stride,
-    const int num_rows, const int num_cols, int num_elems_per_row,
-    const size_type batch_idx)
+template <typename ValueType, typename IndexType>
+GKO_ATTRIBUTES GKO_INLINE ell::batch_item<ValueType, IndexType>
+extract_batch_item(ValueType* const batch_values,
+                   IndexType* const batch_col_idxs, const int stride,
+                   const int num_rows, const int num_cols,
+                   int num_elems_per_row, const size_type batch_idx)
 {
     return {batch_values + batch_idx * num_elems_per_row * num_rows,
             batch_col_idxs,
diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp
index e4dcab23917..c36a877ac14 100644
--- a/core/test/matrix/batch_ell.cpp
+++ b/core/test/matrix/batch_ell.cpp
@@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/base/range.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 
 
@@ -52,26 +51,26 @@ class Ell : public ::testing::Test {
 protected:
     using value_type = T;
     using index_type = gko::int32;
-    using EllMtx = gko::matrix::Ell<value_type>;
+    using BatchEllMtx = gko::batch::matrix::Ell<value_type, index_type>;
+    using EllMtx = gko::matrix::Ell<value_type, index_type>;
     using size_type = gko::size_type;
     Ell()
         : exec(gko::ReferenceExecutor::create()),
-          mtx(gko::batch::initialize<gko::batch::matrix::Ell<value_type>>(
+          mtx(gko::batch::initialize<BatchEllMtx>(
               {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
                {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
               exec, 3)),
-          sp_mtx(gko::batch::initialize<gko::batch::matrix::Ell<value_type>>(
+          sp_mtx(gko::batch::initialize<BatchEllMtx>(
               {{{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}},
                {{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}},
               exec, 2)),
-          ell_mtx(gko::initialize<gko::matrix::Ell<value_type>>(
-              {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec, gko::dim<2>(2, 3), 3)),
-          sp_ell_mtx(gko::initialize<gko::matrix::Ell<value_type>>(
-              {{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, exec, gko::dim<2>(2, 3), 2))
+          ell_mtx(gko::initialize<EllMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}},
+                                          exec, gko::dim<2>(2, 3), 3)),
+          sp_ell_mtx(gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}},
+                                             exec, gko::dim<2>(2, 3), 2))
     {}
 
-    static void assert_equal_to_original_sparse_mtx(
-        const gko::batch::matrix::Ell<value_type>* m)
+    static void assert_equal_to_original_sparse_mtx(const BatchEllMtx* m)
     {
         ASSERT_EQ(m->get_num_batch_items(), 2);
         ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3));
@@ -91,8 +90,7 @@ class Ell : public ::testing::Test {
         ASSERT_EQ(m->get_const_col_idxs()[3], index_type{2});
     }
 
-    static void assert_equal_to_original_mtx(
-        const gko::batch::matrix::Ell<value_type>* m)
+    static void assert_equal_to_original_mtx(const BatchEllMtx* m)
     {
         ASSERT_EQ(m->get_num_batch_items(), 2);
         ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3));
@@ -112,7 +110,7 @@ class Ell : public ::testing::Test {
         ASSERT_EQ(m->get_const_values()[11], value_type{3.0});
     }
 
-    static void assert_empty(gko::batch::matrix::Ell<value_type>* m)
+    static void assert_empty(BatchEllMtx* m)
     {
         ASSERT_EQ(m->get_num_batch_items(), 0);
         ASSERT_EQ(m->get_num_stored_elements(), 0);
@@ -120,10 +118,10 @@ class Ell : public ::testing::Test {
     }
 
     std::shared_ptr<const gko::Executor> exec;
-    std::unique_ptr<gko::batch::matrix::Ell<value_type>> mtx;
-    std::unique_ptr<gko::batch::matrix::Ell<value_type>> sp_mtx;
-    std::unique_ptr<gko::matrix::Ell<value_type>> ell_mtx;
-    std::unique_ptr<gko::matrix::Ell<value_type>> sp_ell_mtx;
+    std::unique_ptr<BatchEllMtx> mtx;
+    std::unique_ptr<BatchEllMtx> sp_mtx;
+    std::unique_ptr<EllMtx> ell_mtx;
+    std::unique_ptr<EllMtx> sp_ell_mtx;
 };
 
 TYPED_TEST_SUITE(Ell, gko::test::ValueTypes);
@@ -143,16 +141,11 @@ TYPED_TEST(Ell, SparseMtxKnowsItsSizeAndValues)
 
 TYPED_TEST(Ell, CanBeEmpty)
 {
-    auto empty = gko::batch::matrix::Ell<TypeParam>::create(this->exec);
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
 
-    this->assert_empty(empty.get());
-}
-
-
-TYPED_TEST(Ell, ReturnsNullValuesArrayWhenEmpty)
-{
-    auto empty = gko::batch::matrix::Ell<TypeParam>::create(this->exec);
+    auto empty = BatchEllMtx::create(this->exec);
 
+    this->assert_empty(empty.get());
     ASSERT_EQ(empty->get_const_values(), nullptr);
 }
 
@@ -180,7 +173,9 @@ TYPED_TEST(Ell, CanCreateSpEllItemView)
 
 TYPED_TEST(Ell, CanBeCopied)
 {
-    auto mtx_copy = gko::batch::matrix::Ell<TypeParam>::create(this->exec);
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+
+    auto mtx_copy = BatchEllMtx::create(this->exec);
 
     mtx_copy->copy_from(this->mtx.get());
 
@@ -192,7 +187,9 @@ TYPED_TEST(Ell, CanBeCopied)
 
 TYPED_TEST(Ell, CanBeMoved)
 {
-    auto mtx_copy = gko::batch::matrix::Ell<TypeParam>::create(this->exec);
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+
+    auto mtx_copy = BatchEllMtx::create(this->exec);
 
     this->mtx->move_to(mtx_copy);
 
@@ -219,10 +216,10 @@ TYPED_TEST(Ell, CanBeCleared)
 
 TYPED_TEST(Ell, CanBeConstructedWithSize)
 {
-    using size_type = gko::size_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
 
-    auto m = gko::batch::matrix::Ell<TypeParam>::create(
-        this->exec, gko::batch_dim<2>(2, gko::dim<2>{5, 3}), 2);
+    auto m = BatchEllMtx::create(this->exec,
+                                 gko::batch_dim<2>(2, gko::dim<2>{5, 3}), 2);
 
     ASSERT_EQ(m->get_num_batch_items(), 2);
     ASSERT_EQ(m->get_common_size(), gko::dim<2>(5, 3));
@@ -235,19 +232,19 @@ TYPED_TEST(Ell, CanBeConstructedFromExistingData)
 {
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
-    using size_type = gko::size_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
     // clang-format off
     value_type values[] = {
        -1.0,  2.5,
-       0.0,  3.5,
-       1.0,  2.0,
-       0.0,  3.0};
+        0.0,  3.5,
+        1.0,  2.0,
+        0.0,  3.0};
     index_type col_idxs[] = {
-       0,  1,
+       0, 1,
       -1, 2};
     // clang-format on
 
-    auto m = gko::batch::matrix::Ell<TypeParam>::create(
+    auto m = BatchEllMtx::create(
         this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 3)), 2,
         gko::array<value_type>::view(this->exec, 8, values),
         gko::array<index_type>::view(this->exec, 4, col_idxs));
@@ -260,19 +257,19 @@ TYPED_TEST(Ell, CanBeConstructedFromExistingConstData)
 {
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
-    using size_type = gko::size_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
     // clang-format off
     value_type values[] = {
        -1.0,  2.5,
-       0.0,  3.5,
-       1.0,  2.0,
-       0.0,  3.0};
+        0.0,  3.5,
+        1.0,  2.0,
+        0.0,  3.0};
     index_type col_idxs[] = {
-       0,  1,
+       0, 1,
       -1, 2};
     // clang-format on
 
-    auto m = gko::batch::matrix::Ell<TypeParam>::create_const(
+    auto m = BatchEllMtx::create_const(
         this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 3)), 2,
         gko::array<value_type>::const_view(this->exec, 8, values),
         gko::array<index_type>::const_view(this->exec, 4, col_idxs));
@@ -283,15 +280,14 @@ TYPED_TEST(Ell, CanBeConstructedFromExistingConstData)
 
 TYPED_TEST(Ell, CanBeConstructedFromEllMatrices)
 {
-    using value_type = typename TestFixture::value_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
     using EllMtx = typename TestFixture::EllMtx;
-    using size_type = gko::size_type;
     auto mat1 = gko::initialize<EllMtx>({{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}},
                                         this->exec);
     auto mat2 =
         gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, this->exec);
 
-    auto m = gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
+    auto m = gko::batch::create_from_item<BatchEllMtx>(
         this->exec, std::vector<EllMtx*>{mat1.get(), mat2.get()},
         mat1->get_num_stored_elements_per_row());
 
@@ -301,19 +297,15 @@ TYPED_TEST(Ell, CanBeConstructedFromEllMatrices)
 
 TYPED_TEST(Ell, CanBeConstructedFromEllMatricesByDuplication)
 {
-    using value_type = typename TestFixture::value_type;
-    using index_type = int;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
     using EllMtx = typename TestFixture::EllMtx;
-    using size_type = gko::size_type;
     auto mat1 =
         gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 0.0}}, this->exec);
-    auto bat_m =
-        gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
-            this->exec,
-            std::vector<EllMtx*>{mat1.get(), mat1.get(), mat1.get()},
-            mat1->get_num_stored_elements_per_row());
+    auto bat_m = gko::batch::create_from_item<BatchEllMtx>(
+        this->exec, std::vector<EllMtx*>{mat1.get(), mat1.get(), mat1.get()},
+        mat1->get_num_stored_elements_per_row());
 
-    auto m = gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
+    auto m = gko::batch::create_from_item<BatchEllMtx>(
         this->exec, 3, mat1.get(), mat1->get_num_stored_elements_per_row());
 
     GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14);
@@ -322,26 +314,23 @@ TYPED_TEST(Ell, CanBeConstructedFromEllMatricesByDuplication)
 
 TYPED_TEST(Ell, CanBeConstructedByDuplicatingEllMatrices)
 {
-    using value_type = typename TestFixture::value_type;
-    using index_type = int;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
     using EllMtx = typename TestFixture::EllMtx;
-    using size_type = gko::size_type;
     auto mat1 = gko::initialize<EllMtx>({{-1.0, 0.0, 0.0}, {0.0, 2.5, 0.0}},
                                         this->exec);
     auto mat2 =
         gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 0.0}}, this->exec);
 
-    auto m = gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
+    auto m = gko::batch::create_from_item<BatchEllMtx>(
         this->exec, std::vector<EllMtx*>{mat1.get(), mat2.get()},
         mat1->get_num_stored_elements_per_row());
-    auto m_ref =
-        gko::batch::create_from_item<gko::batch::matrix::Ell<value_type>>(
-            this->exec,
-            std::vector<EllMtx*>{mat1.get(), mat2.get(), mat1.get(), mat2.get(),
-                                 mat1.get(), mat2.get()},
-            mat1->get_num_stored_elements_per_row());
-
-    auto m2 = gko::batch::duplicate<gko::batch::matrix::Ell<value_type>>(
+    auto m_ref = gko::batch::create_from_item<BatchEllMtx>(
+        this->exec,
+        std::vector<EllMtx*>{mat1.get(), mat2.get(), mat1.get(), mat2.get(),
+                             mat1.get(), mat2.get()},
+        mat1->get_num_stored_elements_per_row());
+
+    auto m2 = gko::batch::duplicate<BatchEllMtx>(
         this->exec, 3, m.get(), mat1->get_num_stored_elements_per_row());
 
     GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14);
@@ -350,17 +339,14 @@ TYPED_TEST(Ell, CanBeConstructedByDuplicatingEllMatrices)
 
 TYPED_TEST(Ell, CanBeUnbatchedIntoEllMatrices)
 {
-    using value_type = typename TestFixture::value_type;
-    using index_type = int;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
     using EllMtx = typename TestFixture::EllMtx;
-    using size_type = gko::size_type;
     auto mat1 = gko::initialize<EllMtx>({{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}},
                                         this->exec);
     auto mat2 =
         gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, this->exec);
 
-    auto ell_mats = gko::batch::unbatch<gko::batch::matrix::Ell<value_type>>(
-        this->sp_mtx.get());
+    auto ell_mats = gko::batch::unbatch<BatchEllMtx>(this->sp_mtx.get());
 
     GKO_ASSERT_MTX_NEAR(ell_mats[0].get(), mat1.get(), 0.);
     GKO_ASSERT_MTX_NEAR(ell_mats[1].get(), mat2.get(), 0.);
@@ -370,10 +356,12 @@ TYPED_TEST(Ell, CanBeUnbatchedIntoEllMatrices)
 TYPED_TEST(Ell, CanBeListConstructed)
 {
     using value_type = typename TestFixture::value_type;
-    using index_type = int;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    using EllMtx = typename TestFixture::EllMtx;
 
-    auto m = gko::batch::initialize<gko::batch::matrix::Ell<TypeParam>>(
-        {{0.0, -1.0}, {1.0, 0.0}}, this->exec);
+    auto m = gko::batch::initialize<BatchEllMtx>({{0.0, -1.0}, {1.0, 0.0}},
+                                                 this->exec);
 
     ASSERT_EQ(m->get_num_batch_items(), 2);
     ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1));
@@ -391,10 +379,11 @@ TYPED_TEST(Ell, CanBeListConstructed)
 TYPED_TEST(Ell, CanBeListConstructedByCopies)
 {
     using value_type = typename TestFixture::value_type;
-    using index_type = int;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
 
-    auto m = gko::batch::initialize<gko::batch::matrix::Ell<TypeParam>>(
-        2, I<value_type>({0.0, -1.0}), this->exec, 1);
+    auto m = gko::batch::initialize<BatchEllMtx>(2, I<value_type>({0.0, -1.0}),
+                                                 this->exec, 1);
 
     ASSERT_EQ(m->get_num_batch_items(), 2);
     ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1));
@@ -412,10 +401,11 @@ TYPED_TEST(Ell, CanBeListConstructedByCopies)
 TYPED_TEST(Ell, CanBeDoubleListConstructed)
 {
     using value_type = typename TestFixture::value_type;
-    using index_type = int;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
     using T = value_type;
 
-    auto m = gko::batch::initialize<gko::batch::matrix::Ell<TypeParam>>(
+    auto m = gko::batch::initialize<BatchEllMtx>(
         // clang-format off
         {{I<T>{1.0, 0.0, 0.0},
           I<T>{2.0, 0.0, 3.0},
@@ -454,15 +444,15 @@ TYPED_TEST(Ell, CanBeDoubleListConstructed)
 TYPED_TEST(Ell, CanBeReadFromMatrixData)
 {
     using value_type = typename TestFixture::value_type;
-    using index_type = int;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
     auto vec_data = std::vector<gko::matrix_data<value_type, index_type>>{};
     vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
         {2, 3}, {{0, 0, -1.0}, {1, 1, 2.5}, {1, 2, 3.5}}));
     vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
         {2, 3}, {{0, 0, 1.0}, {1, 1, 2.0}, {1, 2, 3.0}}));
 
-    auto m = gko::batch::read<value_type, index_type,
-                              gko::batch::matrix::Ell<value_type>>(this->exec,
+    auto m = gko::batch::read<value_type, index_type, BatchEllMtx>(this->exec,
                                                                    vec_data, 2);
 
     this->assert_equal_to_original_sparse_mtx(m.get());
@@ -472,11 +462,11 @@ TYPED_TEST(Ell, CanBeReadFromMatrixData)
 TYPED_TEST(Ell, GeneratesCorrectMatrixData)
 {
     using value_type = typename TestFixture::value_type;
-    using index_type = int;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
     using tpl = typename gko::matrix_data<TypeParam>::nonzero_type;
 
-    auto data = gko::batch::write<value_type, index_type,
-                                  gko::batch::matrix::Ell<value_type>>(
+    auto data = gko::batch::write<value_type, index_type, BatchEllMtx>(
         this->sp_mtx.get());
 
     ASSERT_EQ(data[0].size, gko::dim<2>(2, 3));
diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp
index 8a82ae744e7..7490a24bbe5 100644
--- a/core/test/utils/matrix_generator.hpp
+++ b/core/test/utils/matrix_generator.hpp
@@ -232,7 +232,7 @@ std::unique_ptr<MatrixType> fill_random_matrix_with_sparsity_pattern(
     using index_type = IndexType;
 
     GKO_ASSERT(row_idxs.get_num_elems() == col_idxs.get_num_elems());
-    GKO_ASSERT(row_idxs.get_num_elems() < (num_rows * num_cols));
+    GKO_ASSERT(row_idxs.get_num_elems() <= (num_rows * num_cols));
     auto result = MatrixType::create(exec, std::forward<MatrixArgs>(args)...);
     result->read(fill_random_matrix_data<value_type, index_type>(
         num_rows, num_cols, row_idxs, col_idxs,
diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp
index e2db1ea6e97..4a2a1835961 100644
--- a/cuda/matrix/batch_struct.hpp
+++ b/cuda/matrix/batch_struct.hpp
@@ -91,34 +91,34 @@ get_batch_struct(batch::matrix::Dense<ValueType>* const op)
 /**
  * Generates an immutable uniform batch struct from a batch of ell matrices.
  */
-template <typename ValueType>
-inline batch::matrix::ell::uniform_batch<const cuda_type<ValueType>>
-get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<const cuda_type<ValueType>, IndexType>
+get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
 {
     return {as_cuda_type(op->get_const_values()),
             op->get_const_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_num_stored_elements_per_row())};
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
 }
 
 
 /**
  * Generates a uniform batch struct from a batch of ell matrices.
  */
-template <typename ValueType>
-inline batch::matrix::ell::uniform_batch<cuda_type<ValueType>> get_batch_struct(
-    batch::matrix::Ell<ValueType, int32>* const op)
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<cuda_type<ValueType>, IndexType>
+get_batch_struct(batch::matrix::Ell<ValueType, IndexType>* const op)
 {
     return {as_cuda_type(op->get_values()),
             op->get_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_num_stored_elements_per_row())};
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
 }
 
 
diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
index fca265eceb0..e4d2421a42f 100644
--- a/dpcpp/matrix/batch_ell_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -81,6 +81,7 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
 
     const auto num_batch_items = mat->get_num_batch_items();
     auto device = exec->get_queue()->get_device();
+    // TODO: use runtime selection of group size based on num_rows.
     auto group_size =
         device.get_info<sycl::info::device::max_work_group_size>();
 
@@ -134,6 +135,7 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
 
     const auto num_batch_items = mat_ub.num_batch_items;
     auto device = exec->get_queue()->get_device();
+    // TODO: use runtime selection of group size based on num_rows.
     auto group_size =
         device.get_info<sycl::info::device::max_work_group_size>();
 
diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp.inc
index e6501bafaba..553e0aa1f3c 100644
--- a/dpcpp/matrix/batch_ell_kernels.hpp.inc
+++ b/dpcpp/matrix/batch_ell_kernels.hpp.inc
@@ -30,9 +30,9 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-template <typename ValueType>
+template <typename ValueType, typename IndexType>
 __dpct_inline__ void simple_apply_kernel(
-    const gko::batch::matrix::ell::batch_item<const ValueType>& mat,
+    const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const gko::batch::multi_vector::batch_item<ValueType>& x,
     sycl::nd_item<3>& item_ct1)
@@ -42,37 +42,38 @@ __dpct_inline__ void simple_apply_kernel(
         auto temp = zero<ValueType>();
         for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) {
             const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
-            if (col_idx == invalid_index<int>()) {
+            if (col_idx == invalid_index<IndexType>()) {
                 break;
-                else temp += mat.values[tidx + idx * mat.stride] *
-                             b.values[col_idx * b.stride];
-            }
-            x.values[tidx * x.stride] = temp;
+            } else
+                temp += mat.values[tidx + idx * mat.stride] *
+                        b.values[col_idx * b.stride];
         }
+        x.values[tidx * x.stride] = temp;
     }
+}
 
 
-    template <typename ValueType>
-    __dpct_inline__ void advanced_apply_kernel(
-        const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
-        const gko::batch::matrix::ell::batch_item<const ValueType>& mat,
-        const gko::batch::multi_vector::batch_item<const ValueType>& b,
-        const gko::batch::multi_vector::batch_item<const ValueType>& beta,
-        const gko::batch::multi_vector::batch_item<ValueType>& x,
-        sycl::nd_item<3>& item_ct1)
-    {
-        for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows;
-             tidx += item_ct1.get_local_range().size()) {
-            auto temp = zero<ValueType>();
-            for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) {
-                const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
-                if (col_idx == invalid_index<int>()) {
-                    break;
-                    else temp += alpha.values[0] *
-                                 mat.values[tidx + idx * mat.stride] *
-                                 b.values[col_idx * b.stride];
-                }
-                x.values[tidx * x.stride] =
-                    temp + beta.values[0] * x.values[tidx * x.stride];
-            }
+template <typename ValueType, typename IndexType>
+__dpct_inline__ void advanced_apply_kernel(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
+    const gko::batch::multi_vector::batch_item<const ValueType>& b,
+    const gko::batch::multi_vector::batch_item<const ValueType>& beta,
+    const gko::batch::multi_vector::batch_item<ValueType>& x,
+    sycl::nd_item<3>& item_ct1)
+{
+    for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows;
+         tidx += item_ct1.get_local_range().size()) {
+        auto temp = zero<ValueType>();
+        for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) {
+            const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
+            if (col_idx == invalid_index<IndexType>()) {
+                break;
+            } else
+                temp += alpha.values[0] * mat.values[tidx + idx * mat.stride] *
+                        b.values[col_idx * b.stride];
         }
+        x.values[tidx * x.stride] =
+            temp + beta.values[0] * x.values[tidx * x.stride];
+    }
+}
diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp
index f857653e05e..fe04407d82d 100644
--- a/dpcpp/matrix/batch_struct.hpp
+++ b/dpcpp/matrix/batch_struct.hpp
@@ -90,34 +90,34 @@ inline batch::matrix::dense::uniform_batch<ValueType> get_batch_struct(
 /**
  * Generates an immutable uniform batch struct from a batch of ell matrices.
  */
-template <typename ValueType>
-inline batch::matrix::ell::uniform_batch<const ValueType> get_batch_struct(
-    const batch::matrix::Ell<ValueType, int32>* const op)
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<const ValueType, IndexType>
+get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
 {
     return {op->get_const_values(),
             op->get_const_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_num_stored_elements_per_row())};
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
 }
 
 
 /**
  * Generates a uniform batch struct from a batch of ell matrices.
  */
-template <typename ValueType>
-inline batch::matrix::ell::uniform_batch<ValueType> get_batch_struct(
-    batch::matrix::Ell<ValueType, int32>* const op)
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<ValueType, IndexType> get_batch_struct(
+    batch::matrix::Ell<ValueType, IndexType>* const op)
 {
     return {op->get_values(),
             op->get_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_num_stored_elements_per_row())};
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
 }
 
 
diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp
index 6f15b2d966a..e35f13f1249 100644
--- a/hip/matrix/batch_struct.hip.hpp
+++ b/hip/matrix/batch_struct.hip.hpp
@@ -91,34 +91,34 @@ get_batch_struct(batch::matrix::Dense<ValueType>* const op)
 /**
  * Generates an immutable uniform batch struct from a batch of ell matrices.
  */
-template <typename ValueType>
-inline batch::matrix::ell::uniform_batch<const hip_type<ValueType>>
-get_batch_struct(const batch::matrix::Ell<ValueType, int32>* const op)
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<const hip_type<ValueType>, IndexType>
+get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
 {
     return {as_hip_type(op->get_const_values()),
             op->get_const_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_num_stored_elements_per_row())};
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
 }
 
 
 /**
  * Generates a uniform batch struct from a batch of ell matrices.
  */
-template <typename ValueType>
-inline batch::matrix::ell::uniform_batch<hip_type<ValueType>> get_batch_struct(
-    batch::matrix::Ell<ValueType, int32>* const op)
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<hip_type<ValueType>, IndexType>
+get_batch_struct(batch::matrix::Ell<ValueType, IndexType>* const op)
 {
     return {as_hip_type(op->get_values()),
             op->get_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_num_stored_elements_per_row())};
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
 }
 
 
diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp
index 9a4b8d5cf1d..405603269ff 100644
--- a/include/ginkgo/core/base/batch_multi_vector.hpp
+++ b/include/ginkgo/core/base/batch_multi_vector.hpp
@@ -52,14 +52,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 namespace gko {
 namespace batch {
-namespace matrix {
-
-
-template <typename ValueType>
-class Dense;
-
-
-}
 
 
 /**
@@ -90,21 +82,17 @@ class MultiVector
     : public EnablePolymorphicObject<MultiVector<ValueType>>,
       public EnablePolymorphicAssignment<MultiVector<ValueType>>,
       public EnableCreateMethod<MultiVector<ValueType>>,
-      public ConvertibleTo<MultiVector<next_precision<ValueType>>>,
-      public ConvertibleTo<matrix::Dense<ValueType>> {
+      public ConvertibleTo<MultiVector<next_precision<ValueType>>> {
     friend class EnableCreateMethod<MultiVector>;
     friend class EnablePolymorphicObject<MultiVector>;
     friend class MultiVector<to_complex<ValueType>>;
     friend class MultiVector<next_precision<ValueType>>;
-    friend class matrix::Dense<ValueType>;
 
 public:
     using EnablePolymorphicAssignment<MultiVector>::convert_to;
     using EnablePolymorphicAssignment<MultiVector>::move_to;
     using ConvertibleTo<MultiVector<next_precision<ValueType>>>::convert_to;
     using ConvertibleTo<MultiVector<next_precision<ValueType>>>::move_to;
-    using ConvertibleTo<matrix::Dense<ValueType>>::convert_to;
-    using ConvertibleTo<matrix::Dense<ValueType>>::move_to;
 
     using value_type = ValueType;
     using index_type = int32;
@@ -126,10 +114,6 @@ class MultiVector
 
     void move_to(MultiVector<next_precision<ValueType>>* result) override;
 
-    void convert_to(matrix::Dense<ValueType>* result) const override;
-
-    void move_to(matrix::Dense<ValueType>* result) override;
-
     /**
      * Creates a mutable view (of matrix::Dense type) of one item of the Batch
      * MultiVector object. Does not perform any deep copies, but only returns a
diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp
index 7f3ce5890e4..cbec04482a3 100644
--- a/include/ginkgo/core/matrix/batch_dense.hpp
+++ b/include/ginkgo/core/matrix/batch_dense.hpp
@@ -306,7 +306,6 @@ class Dense final : public EnableBatchLinOp<Dense<ValueType>>,
                size.get_common_size()[1];
     }
 
-protected:
     /**
      * Creates an uninitialized Dense matrix of the specified size.
      *
@@ -362,7 +361,6 @@ class Dense final : public EnableBatchLinOp<Dense<ValueType>>,
                                idx % this->get_common_size()[1]);
     }
 
-private:
     array<value_type> values_;
 };
 
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index be49e2cff41..943f63bfdd7 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -67,6 +67,8 @@ namespace matrix {
  * batch is the same and therefore only a single copy of the sparsity pattern is
  * stored.
  *
+ * @note Currently only IndexType of int32 is supported.
+ *
  * @tparam ValueType  value precision of matrix elements
  * @tparam IndexType  index precision of matrix elements
  *
@@ -83,6 +85,8 @@ class Ell final
     friend class EnablePolymorphicObject<Ell, BatchLinOp>;
     friend class Ell<to_complex<ValueType>, IndexType>;
     friend class Ell<next_precision<ValueType>, IndexType>;
+    static_assert(std::is_same<decltype(IndexType), int32>::value,
+                  "IndexType must be a 32 bit integer");
 
 public:
     using EnableBatchLinOp<Ell>::convert_to;
@@ -315,8 +319,6 @@ class Ell final
                num_elems_per_row;
     }
 
-
-protected:
     /**
      * Creates an uninitialized Ell matrix of the specified size.
      *
@@ -369,7 +371,6 @@ class Ell final
                     const MultiVector<value_type>* beta,
                     MultiVector<value_type>* x) const;
 
-private:
     index_type num_elems_per_row_;
     array<value_type> values_;
     array<index_type> col_idxs_;
diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp.inc
index 44de2a57af9..979df1a19bd 100644
--- a/reference/matrix/batch_ell_kernels.hpp.inc
+++ b/reference/matrix/batch_ell_kernels.hpp.inc
@@ -30,9 +30,9 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-template <typename ValueType>
+template <typename ValueType, typename IndexType>
 inline void simple_apply_kernel(
-    const gko::batch::matrix::ell::batch_item<const ValueType>& a,
+    const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const gko::batch::multi_vector::batch_item<ValueType>& c)
 {
@@ -43,19 +43,21 @@ inline void simple_apply_kernel(
         for (auto k = 0; k < a.num_stored_elems_per_row; ++k) {
             auto val = a.values[row + k * a.stride];
             auto col = a.col_idxs[row + k * a.stride];
-            for (int j = 0; j < c.num_rhs; ++j) {
-                c.values[row * c.stride + j] +=
-                    val * b.values[col * b.stride + j];
+            if (col != invalid_index<IndexType>()) {
+                for (int j = 0; j < c.num_rhs; ++j) {
+                    c.values[row * c.stride + j] +=
+                        val * b.values[col * b.stride + j];
+                }
             }
         }
     }
 }
 
 
-template <typename ValueType>
+template <typename ValueType, typename IndexType>
 inline void advanced_apply_kernel(
     const ValueType alpha,
-    const gko::batch::matrix::ell::batch_item<const ValueType>& a,
+    const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& a,
     const gko::batch::multi_vector::batch_item<const ValueType>& b,
     const ValueType beta,
     const gko::batch::multi_vector::batch_item<ValueType>& c)
@@ -67,9 +69,11 @@ inline void advanced_apply_kernel(
         for (auto k = 0; k < a.num_stored_elems_per_row; ++k) {
             auto val = a.values[row + k * a.stride];
             auto col = a.col_idxs[row + k * a.stride];
-            for (int j = 0; j < b.num_rhs; ++j) {
-                c.values[row * c.stride + j] +=
-                    alpha * val * b.values[col * b.stride + j];
+            if (col != invalid_index<IndexType>()) {
+                for (int j = 0; j < b.num_rhs; ++j) {
+                    c.values[row * c.stride + j] +=
+                        alpha * val * b.values[col * b.stride + j];
+                }
             }
         }
     }
diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp
index fb0e08c16f5..bb7680d1493 100644
--- a/reference/matrix/batch_struct.hpp
+++ b/reference/matrix/batch_struct.hpp
@@ -94,34 +94,34 @@ inline batch::matrix::dense::uniform_batch<ValueType> get_batch_struct(
 /**
  * Generates an immutable uniform batch struct from a batch of ell matrices.
  */
-template <typename ValueType>
-inline batch::matrix::ell::uniform_batch<const ValueType> get_batch_struct(
-    const batch::matrix::Ell<ValueType, int32>* const op)
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<const ValueType, IndexType>
+get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
 {
     return {op->get_const_values(),
             op->get_const_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_num_stored_elements_per_row())};
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
 }
 
 
 /**
  * Generates a uniform batch struct from a batch of ell matrices.
  */
-template <typename ValueType>
-inline batch::matrix::ell::uniform_batch<ValueType> get_batch_struct(
-    batch::matrix::Ell<ValueType, int32>* const op)
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<ValueType, IndexType> get_batch_struct(
+    batch::matrix::Ell<ValueType, IndexType>* const op)
 {
     return {op->get_values(),
             op->get_col_idxs(),
             op->get_num_batch_items(),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[0]),
-            static_cast<int32>(op->get_common_size()[1]),
-            static_cast<int32>(op->get_num_stored_elements_per_row())};
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
 }
 
 
diff --git a/reference/test/matrix/batch_ell_kernels.cpp b/reference/test/matrix/batch_ell_kernels.cpp
index 8a5806a9513..81f189c3e02 100644
--- a/reference/test/matrix/batch_ell_kernels.cpp
+++ b/reference/test/matrix/batch_ell_kernels.cpp
@@ -123,8 +123,8 @@ TYPED_TEST(Ell, AppliesToBatchMultiVector)
     this->mtx_00->apply(this->b_00.get(), this->x_00.get());
     this->mtx_01->apply(this->b_01.get(), this->x_01.get());
     auto res = gko::batch::unbatch<gko::batch::MultiVector<T>>(this->x_0.get());
-    GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.);
-    GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), r<T>::value);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), r<T>::value);
 }
 
 
@@ -149,8 +149,8 @@ TYPED_TEST(Ell, AppliesLinearCombinationToBatchMultiVector)
     this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(),
                         this->x_01.get());
     auto res = gko::batch::unbatch<gko::batch::MultiVector<T>>(this->x_0.get());
-    GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.);
-    GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), r<T>::value);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), r<T>::value);
 }
 
 

From 6d16d3b0ca4dd1fc683409871021817931ca7bc3 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Thu, 12 Oct 2023 12:05:08 +0200
Subject: [PATCH 12/18] Add apply temp clone, review updates

Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
---
 core/matrix/batch_dense.cpp                | 64 +++++++++++++++++++++-
 core/matrix/batch_ell.cpp                  | 62 +++++++++++++++++++++
 dpcpp/matrix/batch_ell_kernels.hpp.inc     |  4 +-
 include/ginkgo/core/matrix/batch_dense.hpp | 38 ++++++++-----
 include/ginkgo/core/matrix/batch_ell.hpp   | 55 +++++++++++--------
 5 files changed, 182 insertions(+), 41 deletions(-)

diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp
index 758635cea7f..8390d43fd7d 100644
--- a/core/matrix/batch_dense.cpp
+++ b/core/matrix/batch_dense.cpp
@@ -124,11 +124,72 @@ Dense<ValueType>::Dense(std::shared_ptr<const Executor> exec,
 {}
 
 
+template <typename ValueType>
+Dense<ValueType>* Dense<ValueType>::apply(
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<MultiVector<ValueType>> x)
+{
+    this->validate_application_parameters(b.get(), x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType>
+const Dense<ValueType>* Dense<ValueType>::apply(
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<MultiVector<ValueType>> x) const
+{
+    this->validate_application_parameters(b.get(), x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType>
+Dense<ValueType>* Dense<ValueType>::apply(
+    ptr_param<const MultiVector<ValueType>> alpha,
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<const MultiVector<ValueType>> beta,
+    ptr_param<MultiVector<ValueType>> x)
+{
+    this->validate_application_parameters(alpha.get(), b.get(), beta.get(),
+                                          x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, alpha).get(),
+                     make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, beta).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType>
+const Dense<ValueType>* Dense<ValueType>::apply(
+    ptr_param<const MultiVector<ValueType>> alpha,
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<const MultiVector<ValueType>> beta,
+    ptr_param<MultiVector<ValueType>> x) const
+{
+    this->validate_application_parameters(alpha.get(), b.get(), beta.get(),
+                                          x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, alpha).get(),
+                     make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, beta).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
 template <typename ValueType>
 void Dense<ValueType>::apply_impl(const MultiVector<ValueType>* b,
                                   MultiVector<ValueType>* x) const
 {
-    this->validate_application_parameters(b, x);
     this->get_executor()->run(dense::make_simple_apply(this, b, x));
 }
 
@@ -139,7 +200,6 @@ void Dense<ValueType>::apply_impl(const MultiVector<ValueType>* alpha,
                                   const MultiVector<ValueType>* beta,
                                   MultiVector<ValueType>* x) const
 {
-    this->validate_application_parameters(alpha, b, beta, x);
     this->get_executor()->run(
         dense::make_advanced_apply(alpha, this, b, beta, x));
 }
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
index c9dbe6d51c9..a50b2f3e23a 100644
--- a/core/matrix/batch_ell.cpp
+++ b/core/matrix/batch_ell.cpp
@@ -140,6 +140,68 @@ Ell<ValueType, IndexType>::Ell(std::shared_ptr<const Executor> exec,
 {}
 
 
+template <typename ValueType, typename IndexType>
+Ell<ValueType, IndexType>* Ell<ValueType, IndexType>::apply(
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<MultiVector<ValueType>> x)
+{
+    this->validate_application_parameters(b.get(), x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType, typename IndexType>
+const Ell<ValueType, IndexType>* Ell<ValueType, IndexType>::apply(
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<MultiVector<ValueType>> x) const
+{
+    this->validate_application_parameters(b.get(), x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType, typename IndexType>
+Ell<ValueType, IndexType>* Ell<ValueType, IndexType>::apply(
+    ptr_param<const MultiVector<ValueType>> alpha,
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<const MultiVector<ValueType>> beta,
+    ptr_param<MultiVector<ValueType>> x)
+{
+    this->validate_application_parameters(alpha.get(), b.get(), beta.get(),
+                                          x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, alpha).get(),
+                     make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, beta).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType, typename IndexType>
+const Ell<ValueType, IndexType>* Ell<ValueType, IndexType>::apply(
+    ptr_param<const MultiVector<ValueType>> alpha,
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<const MultiVector<ValueType>> beta,
+    ptr_param<MultiVector<ValueType>> x) const
+{
+    this->validate_application_parameters(alpha.get(), b.get(), beta.get(),
+                                          x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, alpha).get(),
+                     make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, beta).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
 template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::apply_impl(const MultiVector<ValueType>* b,
                                            MultiVector<ValueType>* x) const
diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp.inc
index 553e0aa1f3c..8cdb8daa273 100644
--- a/dpcpp/matrix/batch_ell_kernels.hpp.inc
+++ b/dpcpp/matrix/batch_ell_kernels.hpp.inc
@@ -42,7 +42,7 @@ __dpct_inline__ void simple_apply_kernel(
         auto temp = zero<ValueType>();
         for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) {
             const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
-            if (col_idx != invalid_index<IndexType>()) {
+            if (col_idx == invalid_index<IndexType>()) {
                 break;
             } else
                 temp += mat.values[tidx + idx * mat.stride] *
@@ -67,7 +67,7 @@ __dpct_inline__ void advanced_apply_kernel(
         auto temp = zero<ValueType>();
         for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) {
             const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
-            if (col_idx != invalid_index<IndexType>()) {
+            if (col_idx == invalid_index<IndexType>()) {
                 break;
             } else
                 temp += alpha.values[0] * mat.values[tidx + idx * mat.stride] *
diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp
index cbec04482a3..07b862ef484 100644
--- a/include/ginkgo/core/matrix/batch_dense.hpp
+++ b/include/ginkgo/core/matrix/batch_dense.hpp
@@ -233,8 +233,8 @@ class Dense final : public EnableBatchLinOp<Dense<ValueType>>,
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const value_type* get_const_values_for_item(
-        size_type batch_id) const noexcept
+    const value_type* get_const_values_for_item(size_type batch_id) const
+        noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return values_.get_const_data() + this->get_cumulative_offset(batch_id);
@@ -275,11 +275,8 @@ class Dense final : public EnableBatchLinOp<Dense<ValueType>>,
      * @param b  the multi-vector to be applied to
      * @param x  the output multi-vector
      */
-    void apply(const MultiVector<value_type>* b,
-               MultiVector<value_type>* x) const
-    {
-        this->apply_impl(b, x);
-    }
+    Dense* apply(ptr_param<const MultiVector<value_type>> b,
+                 ptr_param<MultiVector<value_type>> x);
 
     /**
      * Apply the matrix to a multi-vector with a linear combination of the given
@@ -291,13 +288,26 @@ class Dense final : public EnableBatchLinOp<Dense<ValueType>>,
      * @param beta   the scalar to scale the x vector with
      * @param x      the output multi-vector
      */
-    void apply(const MultiVector<value_type>* alpha,
-               const MultiVector<value_type>* b,
-               const MultiVector<value_type>* beta,
-               MultiVector<value_type>* x) const
-    {
-        this->apply_impl(alpha, b, beta, x);
-    }
+    Dense* apply(ptr_param<const MultiVector<value_type>> alpha,
+                 ptr_param<const MultiVector<value_type>> b,
+                 ptr_param<const MultiVector<value_type>> beta,
+                 ptr_param<MultiVector<value_type>> x);
+
+    /**
+     * @copydoc apply(const MultiVector<value_type>*, MultiVector<value_type>*)
+     */
+    const Dense* apply(ptr_param<const MultiVector<value_type>> b,
+                       ptr_param<MultiVector<value_type>> x) const;
+
+    /**
+     * @copydoc apply(const MultiVector<value_type>*, const
+     * MultiVector<value_type>*, const MultiVector<value_type>*,
+     * MultiVector<value_type>*)
+     */
+    const Dense* apply(ptr_param<const MultiVector<value_type>> alpha,
+                       ptr_param<const MultiVector<value_type>> b,
+                       ptr_param<const MultiVector<value_type>> beta,
+                       ptr_param<MultiVector<value_type>> x) const;
 
 private:
     inline size_type compute_num_elems(const batch_dim<2>& size)
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index 943f63bfdd7..5be94f1035e 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -85,7 +85,7 @@ class Ell final
     friend class EnablePolymorphicObject<Ell, BatchLinOp>;
     friend class Ell<to_complex<ValueType>, IndexType>;
     friend class Ell<next_precision<ValueType>, IndexType>;
-    static_assert(std::is_same<decltype(IndexType), int32>::value,
+    static_assert(std::is_same<IndexType, int32>::value,
                   "IndexType must be a 32 bit integer");
 
 public:
@@ -94,8 +94,7 @@ class Ell final
 
     using value_type = ValueType;
     using index_type = IndexType;
-    using transposed_type = Ell<ValueType, IndexType>;
-    using unbatch_type = gko::matrix::Ell<ValueType, IndexType>;
+    using unbatch_type = gko::matrix::Ell<value_type, index_type>;
     using absolute_type = remove_complex<Ell>;
     using complex_type = to_complex<Ell>;
 
@@ -223,8 +222,8 @@ class Ell final
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const index_type* get_const_col_idxs_for_item(
-        size_type batch_id) const noexcept
+    const index_type* get_const_col_idxs_for_item(size_type batch_id) const
+        noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return col_idxs_.get_const_data();
@@ -252,8 +251,8 @@ class Ell final
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const value_type* get_const_values_for_item(
-        size_type batch_id) const noexcept
+    const value_type* get_const_values_for_item(size_type batch_id) const
+        noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return values_.get_const_data() +
@@ -277,8 +276,8 @@ class Ell final
     static std::unique_ptr<const Ell> create_const(
         std::shared_ptr<const Executor> exec, const batch_dim<2>& sizes,
         const index_type num_elems_per_row,
-        gko::detail::const_array_view<ValueType>&& values,
-        gko::detail::const_array_view<IndexType>&& col_idxs);
+        gko::detail::const_array_view<value_type>&& values,
+        gko::detail::const_array_view<index_type>&& col_idxs);
 
     /**
      * Apply the matrix to a multi-vector. Represents the matrix vector
@@ -287,29 +286,39 @@ class Ell final
      * @param b  the multi-vector to be applied to
      * @param x  the output multi-vector
      */
-    void apply(const MultiVector<value_type>* b,
-               MultiVector<value_type>* x) const
-    {
-        this->apply_impl(b, x);
-    }
+    Ell* apply(ptr_param<const MultiVector<value_type>> b,
+               ptr_param<MultiVector<value_type>> x);
 
     /**
      * Apply the matrix to a multi-vector with a linear combination of the given
-     * input vector. Represents the matrix vector multiplication, x = alpha* A *
-     * b + beta * x, where x and b are both multi-vectors.
+     * input vector. Represents the matrix vector multiplication, x = alpha * A
+     * * b + beta * x, where x and b are both multi-vectors.
      *
      * @param alpha  the scalar to scale the matrix-vector product with
      * @param b      the multi-vector to be applied to
      * @param beta   the scalar to scale the x vector with
      * @param x      the output multi-vector
      */
-    void apply(const MultiVector<value_type>* alpha,
-               const MultiVector<value_type>* b,
-               const MultiVector<value_type>* beta,
-               MultiVector<value_type>* x) const
-    {
-        this->apply_impl(alpha, b, beta, x);
-    }
+    Ell* apply(ptr_param<const MultiVector<value_type>> alpha,
+               ptr_param<const MultiVector<value_type>> b,
+               ptr_param<const MultiVector<value_type>> beta,
+               ptr_param<MultiVector<value_type>> x);
+
+    /**
+     * @copydoc apply(const MultiVector<value_type>*, MultiVector<value_type>*)
+     */
+    const Ell* apply(ptr_param<const MultiVector<value_type>> b,
+                     ptr_param<MultiVector<value_type>> x) const;
+
+    /**
+     * @copydoc apply(const MultiVector<value_type>*, const
+     * MultiVector<value_type>*, const MultiVector<value_type>*,
+     * MultiVector<value_type>*)
+     */
+    const Ell* apply(ptr_param<const MultiVector<value_type>> alpha,
+                     ptr_param<const MultiVector<value_type>> b,
+                     ptr_param<const MultiVector<value_type>> beta,
+                     ptr_param<MultiVector<value_type>> x) const;
 
 private:
     size_type compute_num_elems(const batch_dim<2>& size,

From 40741cba653a680eec84915623b6b2e93a513775 Mon Sep 17 00:00:00 2001
From: ginkgo-bot <ginkgo.library@gmail.com>
Date: Thu, 12 Oct 2023 10:56:01 +0000
Subject: [PATCH 13/18] Format files

Co-authored-by: Pratik Nayak <pratikvn@pm.me>
---
 dpcpp/matrix/batch_ell_kernels.dp.cpp      | 54 +++++++++++-----------
 include/ginkgo/core/matrix/batch_dense.hpp |  4 +-
 include/ginkgo/core/matrix/batch_ell.hpp   |  8 ++--
 3 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
index e4d2421a42f..5a69bbd3d5d 100644
--- a/dpcpp/matrix/batch_ell_kernels.dp.cpp
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -97,17 +97,17 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
     // Launch a kernel that has nbatches blocks, each block has max group size
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(
-            sycl_nd_range(grid, block), [=
-        ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
-                                            config::warp_size)]] {
-                auto group = item_ct1.get_group();
-                auto group_id = group.get_group_linear_id();
-                const auto mat_b =
-                    batch::matrix::extract_batch_item(mat_ub, group_id);
-                const auto b_b = batch::extract_batch_item(b_ub, group_id);
-                const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                simple_apply_kernel(mat_b, b_b, x_b, item_ct1);
-            });
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(config::warp_size)]] {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto mat_b =
+                        batch::matrix::extract_batch_item(mat_ub, group_id);
+                    const auto b_b = batch::extract_batch_item(b_ub, group_id);
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    simple_apply_kernel(mat_b, b_b, x_b, item_ct1);
+                });
     });
 }
 
@@ -145,22 +145,22 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
     // Launch a kernel that has nbatches blocks, each block has max group size
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(
-            sycl_nd_range(grid, block), [=
-        ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
-                                            config::warp_size)]] {
-                auto group = item_ct1.get_group();
-                auto group_id = group.get_group_linear_id();
-                const auto mat_b =
-                    batch::matrix::extract_batch_item(mat_ub, group_id);
-                const auto b_b = batch::extract_batch_item(b_ub, group_id);
-                const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                const auto alpha_b =
-                    batch::extract_batch_item(alpha_ub, group_id);
-                const auto beta_b =
-                    batch::extract_batch_item(beta_ub, group_id);
-                advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b,
-                                      item_ct1);
-            });
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(config::warp_size)]] {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto mat_b =
+                        batch::matrix::extract_batch_item(mat_ub, group_id);
+                    const auto b_b = batch::extract_batch_item(b_ub, group_id);
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    const auto alpha_b =
+                        batch::extract_batch_item(alpha_ub, group_id);
+                    const auto beta_b =
+                        batch::extract_batch_item(beta_ub, group_id);
+                    advanced_apply_kernel(alpha_b, mat_b, b_b, beta_b, x_b,
+                                          item_ct1);
+                });
     });
 }
 
diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp
index 07b862ef484..0b2bcc49166 100644
--- a/include/ginkgo/core/matrix/batch_dense.hpp
+++ b/include/ginkgo/core/matrix/batch_dense.hpp
@@ -233,8 +233,8 @@ class Dense final : public EnableBatchLinOp<Dense<ValueType>>,
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const value_type* get_const_values_for_item(size_type batch_id) const
-        noexcept
+    const value_type* get_const_values_for_item(
+        size_type batch_id) const noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return values_.get_const_data() + this->get_cumulative_offset(batch_id);
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index 5be94f1035e..a6381f90f10 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -222,8 +222,8 @@ class Ell final
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const index_type* get_const_col_idxs_for_item(size_type batch_id) const
-        noexcept
+    const index_type* get_const_col_idxs_for_item(
+        size_type batch_id) const noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return col_idxs_.get_const_data();
@@ -251,8 +251,8 @@ class Ell final
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const value_type* get_const_values_for_item(size_type batch_id) const
-        noexcept
+    const value_type* get_const_values_for_item(
+        size_type batch_id) const noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return values_.get_const_data() +

From b15308fddaaf4d3604e530a51c0d310f66d72134 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Thu, 12 Oct 2023 16:06:05 +0200
Subject: [PATCH 14/18] Fix sparsity issues and review updates

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
Co-authored-by: Yu-Hsiang Tsai <yhmtsai@gmail.com>
---
 core/base/batch_utilities.hpp          | 55 ++++++++++++++++++++++----
 core/matrix/batch_ell.cpp              |  2 -
 core/test/matrix/batch_ell.cpp         | 32 ++++++++++++---
 core/test/utils/batch_helpers.hpp      |  7 ++--
 core/test/utils/matrix_generator.hpp   |  9 +++--
 dpcpp/matrix/batch_ell_kernels.hpp.inc | 10 +++--
 6 files changed, 89 insertions(+), 26 deletions(-)

diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp
index 7204c78a552..3117b35d0f4 100644
--- a/core/base/batch_utilities.hpp
+++ b/core/base/batch_utilities.hpp
@@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -46,6 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/matrix_data.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/base/utils_helper.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
 
 
 namespace gko {
@@ -126,6 +128,36 @@ auto unbatch(const InputType* batch_object)
 }
 
 
+namespace detail {
+
+
+template <typename ValueType, typename IndexType>
+void assert_same_sparsity_in_batched_data(
+    const std::vector<gko::matrix_data<ValueType, IndexType>>& data)
+{
+    auto num_nnz = data[0].nonzeros.size();
+    auto base_data = data[0];
+    base_data.ensure_row_major_order();
+    for (int b = 0; b < data.size(); ++b) {
+        if (data[b].nonzeros.size() != num_nnz) {
+            GKO_NOT_IMPLEMENTED;
+        }
+        auto temp_data = data[b];
+        temp_data.ensure_row_major_order();
+        for (int nnz = 0; nnz < num_nnz; ++nnz) {
+            if (temp_data.nonzeros[nnz].row != base_data.nonzeros[nnz].row ||
+                temp_data.nonzeros[nnz].column !=
+                    base_data.nonzeros[nnz].column) {
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
+    }
+}
+
+
+}  // namespace detail
+
+
 template <typename ValueType, typename IndexType, typename OutputType,
           typename... TArgs>
 std::unique_ptr<OutputType> read(
@@ -134,6 +166,12 @@ std::unique_ptr<OutputType> read(
     TArgs&&... create_args)
 {
     auto num_batch_items = data.size();
+    // Throw if the items in the batch do not all have the same sparsity.
+    if (!std::is_same<OutputType,
+                      gko::batch::matrix::Dense<ValueType>>::value &&
+        !std::is_same<OutputType, gko::batch::MultiVector<ValueType>>::value) {
+        detail::assert_same_sparsity_in_batched_data(data);
+    }
     auto tmp =
         OutputType::create(exec, batch_dim<2>(num_batch_items, data[0].size),
                            std::forward<TArgs>(create_args)...);
@@ -163,7 +201,8 @@ std::vector<gko::matrix_data<ValueType, IndexType>> write(
 
 
 /**
- * Creates and initializes a batch of single column-vectors.
+ * Creates and initializes a batch of the specified Matrix type with a single
+ * column-vector.
  *
  * @tparam Matrix  matrix type to initialize (It has to implement the
  *                 read<Matrix> function)
@@ -278,15 +317,16 @@ std::unique_ptr<Matrix> initialize(
 
 
 /**
- * Creates and initializes a batch single column-vector by making copies of the
- * single input column vector.
+ * Creates and initializes a batch of specified Matrix type with a single
+ * column-vector by making copies of the single input column vector.
  *
  * @tparam Matrix  matrix type to initialize (It has to implement the
  *                 read<Matrix> function)
  * @tparam TArgs  argument types for Matrix::create method
  *                (not including the implied Executor as the first argument)
  *
- * @param num_vectors  The number of times the input vector is to be duplicated
+ * @param num_batch_items  The number of times the input vector is to be
+ *                         duplicated
  * @param vals  values used to initialize each vector in the temp. batch
  * @param exec  Executor associated with the matrix
  * @param create_args  additional arguments passed to Matrix::create, not
@@ -297,21 +337,20 @@ std::unique_ptr<Matrix> initialize(
  */
 template <typename Matrix, typename... TArgs>
 std::unique_ptr<Matrix> initialize(
-    const size_type num_vectors,
+    const size_type num_batch_items,
     std::initializer_list<typename Matrix::value_type> vals,
     std::shared_ptr<const Executor> exec, TArgs&&... create_args)
 {
     using value_type = typename Matrix::value_type;
     using index_type = typename Matrix::index_type;
     using mat_data = gko::matrix_data<value_type, index_type>;
-    size_type num_batch_items = num_vectors;
     GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0,
                          "Input data is empty");
     auto num_rows = begin(vals) ? vals.size() : 0;
     auto common_size = dim<2>(num_rows, 1);
     auto b_size = batch_dim<2>(num_batch_items, common_size);
     std::vector<mat_data> input_mat_data(num_batch_items, common_size);
-    for (size_type batch = 0; batch < num_vectors; batch++) {
+    for (size_type batch = 0; batch < num_batch_items; batch++) {
         input_mat_data[batch].nonzeros.reserve(num_rows);
         size_type idx = 0;
         for (const auto& elem : vals) {
@@ -334,7 +373,7 @@ std::unique_ptr<Matrix> initialize(
  * @tparam TArgs  argument types for Matrix::create method
  *                (not including the implied Executor as the first argument)
  *
- * @param num_batch_items The number of times the input matrix is duplicated
+ * @param num_batch_items  The number of times the input matrix is duplicated
  * @param vals  values used to initialize each matrix in the temp. batch
  * @param exec  Executor associated to the matrix
  * @param create_args  additional arguments passed to Matrix::create, not
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
index a50b2f3e23a..5626860e7ee 100644
--- a/core/matrix/batch_ell.cpp
+++ b/core/matrix/batch_ell.cpp
@@ -206,7 +206,6 @@ template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::apply_impl(const MultiVector<ValueType>* b,
                                            MultiVector<ValueType>* x) const
 {
-    this->validate_application_parameters(b, x);
     this->get_executor()->run(ell::make_simple_apply(this, b, x));
 }
 
@@ -217,7 +216,6 @@ void Ell<ValueType, IndexType>::apply_impl(const MultiVector<ValueType>* alpha,
                                            const MultiVector<ValueType>* beta,
                                            MultiVector<ValueType>* x) const
 {
-    this->validate_application_parameters(alpha, b, beta, x);
     this->get_executor()->run(
         ell::make_advanced_apply(alpha, this, b, beta, x));
 }
diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp
index c36a877ac14..e04ed96bf4c 100644
--- a/core/test/matrix/batch_ell.cpp
+++ b/core/test/matrix/batch_ell.cpp
@@ -360,7 +360,7 @@ TYPED_TEST(Ell, CanBeListConstructed)
     using BatchEllMtx = typename TestFixture::BatchEllMtx;
     using EllMtx = typename TestFixture::EllMtx;
 
-    auto m = gko::batch::initialize<BatchEllMtx>({{0.0, -1.0}, {1.0, 0.0}},
+    auto m = gko::batch::initialize<BatchEllMtx>({{0.0, -1.0}, {0.0, -5.0}},
                                                  this->exec);
 
     ASSERT_EQ(m->get_num_batch_items(), 2);
@@ -369,10 +369,10 @@ TYPED_TEST(Ell, CanBeListConstructed)
     ASSERT_EQ(m->get_num_stored_elements_per_row(), 1);
     EXPECT_EQ(m->get_values()[0], value_type{0.0});
     EXPECT_EQ(m->get_values()[1], value_type{-1.0});
-    EXPECT_EQ(m->get_values()[2], value_type{1.0});
-    EXPECT_EQ(m->get_values()[3], value_type{0.0});
-    EXPECT_EQ(m->get_col_idxs()[0], index_type{0});
-    EXPECT_EQ(m->get_col_idxs()[1], index_type{-1});
+    EXPECT_EQ(m->get_values()[2], value_type{0.0});
+    EXPECT_EQ(m->get_values()[3], value_type{-5.0});
+    EXPECT_EQ(m->get_col_idxs()[0], index_type{-1});
+    EXPECT_EQ(m->get_col_idxs()[1], index_type{0});
 }
 
 
@@ -459,6 +459,28 @@ TYPED_TEST(Ell, CanBeReadFromMatrixData)
 }
 
 
+TYPED_TEST(Ell, CanBeDetectDataWithDifferentSparsity)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    auto vec_data = std::vector<gko::matrix_data<value_type, index_type>>{};
+    vec_data.emplace_back(
+        gko::matrix_data<value_type, index_type>({2, 3}, {
+                                                             {0, 0, -1.0},
+                                                             {1, 1, 2.5},
+                                                             {1, 2, 0.5},
+                                                             {2, 2, -3.0},
+                                                         }));
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 3}, {{0, 0, 1.0}, {1, 1, 2.0}, {1, 2, 3.0}}));
+
+    EXPECT_THROW(
+        gko::batch::detail::assert_same_sparsity_in_batched_data(vec_data),
+        gko::NotImplemented);
+}
+
+
 TYPED_TEST(Ell, GeneratesCorrectMatrixData)
 {
     using value_type = typename TestFixture::value_type;
diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp
index 0b6197b5062..5b1fa60ed36 100644
--- a/core/test/utils/batch_helpers.hpp
+++ b/core/test/utils/batch_helpers.hpp
@@ -95,9 +95,10 @@ std::unique_ptr<MatrixType> generate_random_batch_matrix(
                         .copy_to_array();
 
     for (size_type b = 0; b < num_batch_items; b++) {
-        auto rand_mat = fill_random_matrix_with_sparsity_pattern<
-            typename MatrixType::unbatch_type, index_type>(
-            num_rows, num_cols, row_idxs, col_idxs, value_dist, engine, exec);
+        auto rand_mat =
+            fill_random_matrix<typename MatrixType::unbatch_type, index_type>(
+                num_rows, num_cols, row_idxs, col_idxs, value_dist, engine,
+                exec);
         result->create_view_for_item(b)->copy_from(rand_mat.get());
     }
 
diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp
index 7490a24bbe5..d5370c6ef6a 100644
--- a/core/test/utils/matrix_generator.hpp
+++ b/core/test/utils/matrix_generator.hpp
@@ -206,23 +206,24 @@ generate_random_device_matrix_data(gko::size_type num_rows,
  * @tparam MatrixType  type of matrix to generate (must implement
  *                     the interface `ReadableFromMatrixData<>` and provide
  *                     matching `value_type` and `index_type` type aliases)
+ * @tparam IndexType  the type for row and column indices
+ * @tparam ValueDistribution  type of value distribution
+ * @tparam Engine  type of random engine
  *
  * @param num_rows  number of rows
  * @param num_cols  number of columns
- * @param value_dist  distribution of matrix values
  * @param row_idxs  the row indices of the matrix
  * @param col_idxs  the column indices of the matrix
+ * @param value_dist  distribution of matrix values
  * @param exec  executor where the matrix should be allocated
  * @param args  additional arguments for the matrix constructor
  *
- * The other (template) parameters match generate_random_matrix_data.
- *
  * @return the unique pointer of MatrixType
  */
 template <typename MatrixType = matrix::Dense<>,
           typename IndexType = typename MatrixType::index_type,
           typename ValueDistribution, typename Engine, typename... MatrixArgs>
-std::unique_ptr<MatrixType> fill_random_matrix_with_sparsity_pattern(
+std::unique_ptr<MatrixType> fill_random_matrix(
     size_type num_rows, size_type num_cols,
     const gko::array<IndexType>& row_idxs,
     const gko::array<IndexType>& col_idxs, ValueDistribution&& value_dist,
diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp.inc
index 8cdb8daa273..64d71710dbb 100644
--- a/dpcpp/matrix/batch_ell_kernels.hpp.inc
+++ b/dpcpp/matrix/batch_ell_kernels.hpp.inc
@@ -44,9 +44,10 @@ __dpct_inline__ void simple_apply_kernel(
             const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
             if (col_idx == invalid_index<IndexType>()) {
                 break;
-            } else
+            } else {
                 temp += mat.values[tidx + idx * mat.stride] *
                         b.values[col_idx * b.stride];
+            }
         }
         x.values[tidx * x.stride] = temp;
     }
@@ -69,11 +70,12 @@ __dpct_inline__ void advanced_apply_kernel(
             const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
             if (col_idx == invalid_index<IndexType>()) {
                 break;
-            } else
-                temp += alpha.values[0] * mat.values[tidx + idx * mat.stride] *
+            } else {
+                temp += mat.values[tidx + idx * mat.stride] *
                         b.values[col_idx * b.stride];
+            }
         }
         x.values[tidx * x.stride] =
-            temp + beta.values[0] * x.values[tidx * x.stride];
+            alpha.values[0] * temp + beta.values[0] * x.values[tidx * x.stride];
     }
 }

From 4b1fbc1cd5ecda697967e05bf17a32ffc18a3cb1 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 16 Oct 2023 16:28:44 +0200
Subject: [PATCH 15/18] vector mat data with duplication

---
 core/base/batch_utilities.hpp | 39 ++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp
index 3117b35d0f4..e6a52250565 100644
--- a/core/base/batch_utilities.hpp
+++ b/core/base/batch_utilities.hpp
@@ -349,17 +349,16 @@ std::unique_ptr<Matrix> initialize(
     auto num_rows = begin(vals) ? vals.size() : 0;
     auto common_size = dim<2>(num_rows, 1);
     auto b_size = batch_dim<2>(num_batch_items, common_size);
-    std::vector<mat_data> input_mat_data(num_batch_items, common_size);
-    for (size_type batch = 0; batch < num_batch_items; batch++) {
-        input_mat_data[batch].nonzeros.reserve(num_rows);
-        size_type idx = 0;
-        for (const auto& elem : vals) {
-            if (elem != zero<value_type>()) {
-                input_mat_data[batch].nonzeros.emplace_back(idx, 0, elem);
-            }
-            ++idx;
+    mat_data single_mat_data(common_size);
+    single_mat_data.nonzeros.reserve(num_rows);
+    size_type idx = 0;
+    for (const auto& elem : vals) {
+        if (elem != zero<value_type>()) {
+            single_mat_data.nonzeros.emplace_back(idx, 0, elem);
         }
+        ++idx;
     }
+    std::vector<mat_data> input_mat_data(num_batch_items, single_mat_data);
     return read<value_type, index_type, Matrix>(
         exec, input_mat_data, std::forward<TArgs>(create_args)...);
 }
@@ -397,21 +396,19 @@ std::unique_ptr<Matrix> initialize(
     auto common_size = dim<2>(begin(vals) ? vals.size() : 0,
                               begin(vals) ? begin(vals)->size() : 0);
     batch_dim<2> b_size(num_batch_items, common_size);
-    std::vector<mat_data> input_mat_data(num_batch_items, common_size);
-    for (size_type batch = 0; batch < num_batch_items; batch++) {
-        size_type ridx = 0;
-        for (const auto& row : vals) {
-            size_type cidx = 0;
-            for (const auto& elem : row) {
-                if (elem != zero<value_type>()) {
-                    input_mat_data[batch].nonzeros.emplace_back(ridx, cidx,
-                                                                elem);
-                }
-                ++cidx;
+    mat_data single_mat_data(common_size);
+    size_type ridx = 0;
+    for (const auto& row : vals) {
+        size_type cidx = 0;
+        for (const auto& elem : row) {
+            if (elem != zero<value_type>()) {
+                single_mat_data.nonzeros.emplace_back(ridx, cidx, elem);
             }
-            ++ridx;
+            ++cidx;
         }
+        ++ridx;
     }
+    std::vector<mat_data> input_mat_data(num_batch_items, single_mat_data);
     return read<value_type, index_type, Matrix>(
         exec, input_mat_data, std::forward<TArgs>(create_args)...);
 }

From d231ca60ed109326799dfe4da78860d628d26312 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 16 Oct 2023 21:49:43 +0200
Subject: [PATCH 16/18] Review updates

Co-authored-by: Yu-Hsiang Tsai <yhmtsai@gmail.com>
Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 core/base/batch_utilities.hpp              | 44 +++++++++++++++++-----
 core/matrix/batch_dense.cpp                |  8 ----
 core/matrix/batch_ell.cpp                  | 11 ------
 core/test/matrix/batch_ell.cpp             | 23 ++++++++++-
 include/ginkgo/core/matrix/batch_dense.hpp | 13 +------
 include/ginkgo/core/matrix/batch_ell.hpp   | 17 ++-------
 6 files changed, 62 insertions(+), 54 deletions(-)

diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp
index e6a52250565..febfd59b636 100644
--- a/core/base/batch_utilities.hpp
+++ b/core/base/batch_utilities.hpp
@@ -54,6 +54,9 @@ namespace gko {
 namespace batch {
 
 
+/**
+ * Duplicate a given input batch object.
+ */
 template <typename OutputType, typename... TArgs>
 std::unique_ptr<OutputType> duplicate(std::shared_ptr<const Executor> exec,
                                       size_type num_duplications,
@@ -78,6 +81,9 @@ std::unique_ptr<OutputType> duplicate(std::shared_ptr<const Executor> exec,
 }
 
 
+/**
+ * Duplicate a monolithic matrix and create a batch object.
+ */
 template <typename OutputType, typename... TArgs>
 std::unique_ptr<OutputType> create_from_item(
     std::shared_ptr<const Executor> exec, const size_type num_duplications,
@@ -96,6 +102,13 @@ std::unique_ptr<OutputType> create_from_item(
 }
 
 
+/**
+ * Create a batch object from a vector of monolithic objects that share the
+ * same sparsity pattern.
+ *
+ * @note The sparsity of the elements in the input vector of matrices needs to
+ * be the same. TODO: Check for same sparsity among the different input items
+ */
 template <typename OutputType, typename... TArgs>
 std::unique_ptr<OutputType> create_from_item(
     std::shared_ptr<const Executor> exec,
@@ -115,6 +128,9 @@ std::unique_ptr<OutputType> create_from_item(
 }
 
 
+/**
+ * Unbatch a batched object into a vector of items of its unbatch_type.
+ */
 template <typename InputType>
 auto unbatch(const InputType* batch_object)
 {
@@ -135,19 +151,20 @@ template <typename ValueType, typename IndexType>
 void assert_same_sparsity_in_batched_data(
     const std::vector<gko::matrix_data<ValueType, IndexType>>& data)
 {
-    auto num_nnz = data[0].nonzeros.size();
-    auto base_data = data[0];
+    auto num_nnz = data.at(0).nonzeros.size();
+    auto base_data = data.at(0);
     base_data.ensure_row_major_order();
-    for (int b = 0; b < data.size(); ++b) {
+    for (int b = 1; b < data.size(); ++b) {
         if (data[b].nonzeros.size() != num_nnz) {
             GKO_NOT_IMPLEMENTED;
         }
-        auto temp_data = data[b];
+        auto temp_data = data.at(b);
         temp_data.ensure_row_major_order();
         for (int nnz = 0; nnz < num_nnz; ++nnz) {
-            if (temp_data.nonzeros[nnz].row != base_data.nonzeros[nnz].row ||
-                temp_data.nonzeros[nnz].column !=
-                    base_data.nonzeros[nnz].column) {
+            if (temp_data.nonzeros.at(nnz).row !=
+                    base_data.nonzeros.at(nnz).row ||
+                temp_data.nonzeros.at(nnz).column !=
+                    base_data.nonzeros.at(nnz).column) {
                 GKO_NOT_IMPLEMENTED;
             }
         }
@@ -158,6 +175,10 @@ void assert_same_sparsity_in_batched_data(
 }  // namespace detail
 
 
+/**
+ * Create a batch object from a vector of gko::matrix_data objects. Each item of
+ * the vector needs to store the same sparsity pattern.
+ */
 template <typename ValueType, typename IndexType, typename OutputType,
           typename... TArgs>
 std::unique_ptr<OutputType> read(
@@ -173,7 +194,7 @@ std::unique_ptr<OutputType> read(
         detail::assert_same_sparsity_in_batched_data(data);
     }
     auto tmp =
-        OutputType::create(exec, batch_dim<2>(num_batch_items, data[0].size),
+        OutputType::create(exec, batch_dim<2>(num_batch_items, data.at(0).size),
                            std::forward<TArgs>(create_args)...);
 
     for (size_type b = 0; b < num_batch_items; ++b) {
@@ -184,6 +205,9 @@ std::unique_ptr<OutputType> read(
 }
 
 
+/**
+ * Write a vector of matrix data objects from an input batch object.
+ */
 template <typename ValueType, typename IndexType, typename OutputType>
 std::vector<gko::matrix_data<ValueType, IndexType>> write(
     const OutputType* mvec)
@@ -201,8 +225,8 @@ std::vector<gko::matrix_data<ValueType, IndexType>> write(
 
 
 /**
- * Creates and initializes a batch of the specified Matrix type with a single
- * column-vector.
+ * Creates and initializes a batch of the specified Matrix type from a series of
+ * single column-vectors.
  *
  * @tparam Matrix  matrix type to initialize (It has to implement the
  *                 read<Matrix> function)
diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp
index 8390d43fd7d..58c7fa25cea 100644
--- a/core/matrix/batch_dense.cpp
+++ b/core/matrix/batch_dense.cpp
@@ -96,14 +96,6 @@ Dense<ValueType>::create_const_view_for_item(size_type item_id) const
 }
 
 
-template <typename ValueType>
-std::unique_ptr<Dense<ValueType>> Dense<ValueType>::create_with_config_of(
-    ptr_param<const Dense<ValueType>> other)
-{
-    return Dense<ValueType>::create(other->get_executor(), other->get_size());
-}
-
-
 template <typename ValueType>
 std::unique_ptr<const Dense<ValueType>> Dense<ValueType>::create_const(
     std::shared_ptr<const Executor> exec, const batch_dim<2>& sizes,
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
index 5626860e7ee..88863a05dd4 100644
--- a/core/matrix/batch_ell.cpp
+++ b/core/matrix/batch_ell.cpp
@@ -100,17 +100,6 @@ Ell<ValueType, IndexType>::create_const_view_for_item(size_type item_id) const
 }
 
 
-template <typename ValueType, typename IndexType>
-std::unique_ptr<Ell<ValueType, IndexType>>
-Ell<ValueType, IndexType>::create_with_config_of(
-    ptr_param<const Ell<ValueType, IndexType>> other)
-{
-    return Ell<ValueType, IndexType>::create(
-        other->get_executor(), other->get_size(),
-        other->get_num_stored_elements_per_row());
-}
-
-
 template <typename ValueType, typename IndexType>
 std::unique_ptr<const Ell<ValueType, IndexType>>
 Ell<ValueType, IndexType>::create_const(
diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp
index e04ed96bf4c..2c8166aa023 100644
--- a/core/test/matrix/batch_ell.cpp
+++ b/core/test/matrix/batch_ell.cpp
@@ -459,7 +459,7 @@ TYPED_TEST(Ell, CanBeReadFromMatrixData)
 }
 
 
-TYPED_TEST(Ell, CanBeDetectDataWithDifferentSparsity)
+TYPED_TEST(Ell, ThrowsForDataWithDifferentNnz)
 {
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
@@ -481,6 +481,27 @@ TYPED_TEST(Ell, CanBeDetectDataWithDifferentSparsity)
 }
 
 
+TYPED_TEST(Ell, ThrowsForDataWithDifferentSparsity)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    auto vec_data = std::vector<gko::matrix_data<value_type, index_type>>{};
+    vec_data.emplace_back(
+        gko::matrix_data<value_type, index_type>({2, 3}, {
+                                                             {0, 0, -1.0},
+                                                             {1, 1, 2.5},
+                                                             {2, 2, -3.0},
+                                                         }));
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 3}, {{0, 0, 1.0}, {1, 1, 2.0}, {1, 2, 3.0}}));
+
+    EXPECT_THROW(
+        gko::batch::detail::assert_same_sparsity_in_batched_data(vec_data),
+        gko::NotImplemented);
+}
+
+
 TYPED_TEST(Ell, GeneratesCorrectMatrixData)
 {
     using value_type = typename TestFixture::value_type;
diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp
index 0b2bcc49166..5a1697afec4 100644
--- a/include/ginkgo/core/matrix/batch_dense.hpp
+++ b/include/ginkgo/core/matrix/batch_dense.hpp
@@ -93,15 +93,6 @@ class Dense final : public EnableBatchLinOp<Dense<ValueType>>,
     using absolute_type = remove_complex<Dense>;
     using complex_type = to_complex<Dense>;
 
-    /**
-     * Creates a Dense matrix with the configuration of another Dense
-     * matrix.
-     *
-     * @param other  The other matrix whose configuration needs to copied.
-     */
-    static std::unique_ptr<Dense> create_with_config_of(
-        ptr_param<const Dense> other);
-
     void convert_to(Dense<next_precision<ValueType>>* result) const override;
 
     void move_to(Dense<next_precision<ValueType>>* result) override;
@@ -233,8 +224,8 @@ class Dense final : public EnableBatchLinOp<Dense<ValueType>>,
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const value_type* get_const_values_for_item(
-        size_type batch_id) const noexcept
+    const value_type* get_const_values_for_item(size_type batch_id) const
+        noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return values_.get_const_data() + this->get_cumulative_offset(batch_id);
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index a6381f90f10..a02d6c81fe8 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -98,15 +98,6 @@ class Ell final
     using absolute_type = remove_complex<Ell>;
     using complex_type = to_complex<Ell>;
 
-    /**
-     * Creates a Ell matrix with the configuration of another Ell
-     * matrix.
-     *
-     * @param other  The other matrix whose configuration needs to copied.
-     */
-    static std::unique_ptr<Ell> create_with_config_of(
-        ptr_param<const Ell> other);
-
     void convert_to(
         Ell<next_precision<ValueType>, IndexType>* result) const override;
 
@@ -222,8 +213,8 @@ class Ell final
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const index_type* get_const_col_idxs_for_item(
-        size_type batch_id) const noexcept
+    const index_type* get_const_col_idxs_for_item(size_type batch_id) const
+        noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return col_idxs_.get_const_data();
@@ -251,8 +242,8 @@ class Ell final
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const value_type* get_const_values_for_item(
-        size_type batch_id) const noexcept
+    const value_type* get_const_values_for_item(size_type batch_id) const
+        noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return values_.get_const_data() +

From 48e94bb6b45092f5eaf7bedcd83646a21467f414 Mon Sep 17 00:00:00 2001
From: ginkgo-bot <ginkgo.library@gmail.com>
Date: Mon, 16 Oct 2023 19:52:38 +0000
Subject: [PATCH 17/18] Format files

Co-authored-by: Pratik Nayak <pratikvn@pm.me>
---
 include/ginkgo/core/matrix/batch_dense.hpp | 4 ++--
 include/ginkgo/core/matrix/batch_ell.hpp   | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp
index 5a1697afec4..47230c24e32 100644
--- a/include/ginkgo/core/matrix/batch_dense.hpp
+++ b/include/ginkgo/core/matrix/batch_dense.hpp
@@ -224,8 +224,8 @@ class Dense final : public EnableBatchLinOp<Dense<ValueType>>,
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const value_type* get_const_values_for_item(size_type batch_id) const
-        noexcept
+    const value_type* get_const_values_for_item(
+        size_type batch_id) const noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return values_.get_const_data() + this->get_cumulative_offset(batch_id);
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
index a02d6c81fe8..fa00a0631fd 100644
--- a/include/ginkgo/core/matrix/batch_ell.hpp
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -213,8 +213,8 @@ class Ell final
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const index_type* get_const_col_idxs_for_item(size_type batch_id) const
-        noexcept
+    const index_type* get_const_col_idxs_for_item(
+        size_type batch_id) const noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return col_idxs_.get_const_data();
@@ -242,8 +242,8 @@ class Ell final
      *       significantly more memory efficient than the non-constant version,
      *       so always prefer this version.
      */
-    const value_type* get_const_values_for_item(size_type batch_id) const
-        noexcept
+    const value_type* get_const_values_for_item(
+        size_type batch_id) const noexcept
     {
         GKO_ASSERT(batch_id < this->get_num_batch_items());
         return values_.get_const_data() +

From 0949431736b4ebaeea11d877dca0cc076871273a Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 17 Oct 2023 10:11:59 +0200
Subject: [PATCH 18/18] Review updates

Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 core/base/batch_utilities.hpp |  3 +++
 core/matrix/batch_ell.cpp     | 13 ++-----------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp
index febfd59b636..b4e380a4162 100644
--- a/core/base/batch_utilities.hpp
+++ b/core/base/batch_utilities.hpp
@@ -151,6 +151,9 @@ template <typename ValueType, typename IndexType>
 void assert_same_sparsity_in_batched_data(
     const std::vector<gko::matrix_data<ValueType, IndexType>>& data)
 {
+    if (data.empty()) {
+        return;
+    }
     auto num_nnz = data.at(0).nonzeros.size();
     auto base_data = data.at(0);
     base_data.ensure_row_major_order();
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
index 88863a05dd4..b2987e741d9 100644
--- a/core/matrix/batch_ell.cpp
+++ b/core/matrix/batch_ell.cpp
@@ -147,10 +147,12 @@ const Ell<ValueType, IndexType>* Ell<ValueType, IndexType>::apply(
     ptr_param<const MultiVector<ValueType>> b,
     ptr_param<MultiVector<ValueType>> x) const
 {
+    // Do the work in this const overload. Delegating via `this->apply(b, x)`
+    // would select this same overload (`this` is const) and never terminate.
     this->validate_application_parameters(b.get(), x.get());
     auto exec = this->get_executor();
     this->apply_impl(make_temporary_clone(exec, b).get(),
                      make_temporary_clone(exec, x).get());
     return this;
 }
 
@@ -180,13 +177,15 @@ const Ell<ValueType, IndexType>* Ell<ValueType, IndexType>::apply(
     ptr_param<const MultiVector<ValueType>> beta,
     ptr_param<MultiVector<ValueType>> x) const
 {
+    // Do the work in this const overload. `this->apply(alpha, b, beta, x)`
+    // would select this same overload (`this` is const) and never terminate.
     this->validate_application_parameters(alpha.get(), b.get(), beta.get(),
                                           x.get());
     auto exec = this->get_executor();
     this->apply_impl(make_temporary_clone(exec, alpha).get(),
                      make_temporary_clone(exec, b).get(),
                      make_temporary_clone(exec, beta).get(),
                      make_temporary_clone(exec, x).get());
     return this;
 }