Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support column batch for polynomial division CPU backend #645

Merged
merged 1 commit into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 39 additions & 24 deletions icicle/backend/cpu/src/field/cpu_vec_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -803,12 +803,19 @@ REGISTER_SLICE_EXT_FIELD_BACKEND("CPU", cpu_slice<extension_t>);

/*********************************** Highest non-zero idx ***********************************/
template <typename T>
eIcicleError cpu_highest_non_zero_idx(
const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/)
eIcicleError cpu_highest_non_zero_idx_internal(
const Device& device,
const T* input,
uint64_t size,
const VecOpsConfig& config,
int64_t* out_idx /*OUT*/,
int32_t idx_in_batch_to_calc)
{
ICICLE_ASSERT(input && out_idx && size != 0) << "Error: Invalid argument";
uint64_t stride = config.columns_batch ? config.batch_size : 1;
for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) {
uint32_t start_idx = (idx_in_batch_to_calc == -1) ? 0 : idx_in_batch_to_calc;
uint32_t end_idx = (idx_in_batch_to_calc == -1) ? config.batch_size : idx_in_batch_to_calc + 1;
for (uint64_t idx_in_batch = start_idx; idx_in_batch < end_idx; ++idx_in_batch) {
out_idx[idx_in_batch] = -1; // zero vector is considered '-1' since 0 would be zero in vec[0]
const T* curr_input =
config.columns_batch ? input + idx_in_batch : input + idx_in_batch * size; // Pointer to the current vector
Expand All @@ -822,6 +829,13 @@ eIcicleError cpu_highest_non_zero_idx(
return eIcicleError::SUCCESS;
}

template <typename T>
eIcicleError cpu_highest_non_zero_idx(
const Device& device, const T* input, uint64_t size, const VecOpsConfig& config, int64_t* out_idx /*OUT*/)
{
return cpu_highest_non_zero_idx_internal(device, input, size, config, out_idx, -1);
}

REGISTER_HIGHEST_NON_ZERO_IDX_BACKEND("CPU", cpu_highest_non_zero_idx<scalar_t>);

/*********************************** Polynomial evaluation ***********************************/
Expand Down Expand Up @@ -887,39 +901,40 @@ eIcicleError cpu_poly_divide(
T* r_out /*OUT*/,
uint64_t r_size)
{
if (config.batch_size != 1 && config.columns_batch) {
ICICLE_LOG_ERROR << "polynomial division is not implemented for column batch. Planned for v3.2";
return eIcicleError::API_NOT_IMPLEMENTED;
}

uint32_t stride = config.columns_batch ? config.batch_size : 1;
auto numerator_deg = std::make_unique<int64_t[]>(config.batch_size);
auto denominator_deg = std::make_unique<int64_t[]>(config.batch_size);
auto deg_r = std::make_unique<int64_t[]>(config.batch_size);
cpu_highest_non_zero_idx(device, numerator, numerator_size, config, numerator_deg.get());
cpu_highest_non_zero_idx(device, denominator, denominator_size, config, denominator_deg.get());
memset(r_out, 0, sizeof(T) * (r_size * config.batch_size));
memcpy(r_out, numerator, sizeof(T) * (numerator_size * config.batch_size));

for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; ++idx_in_batch) {
ICICLE_ASSERT(r_size >= numerator_deg[idx_in_batch] + 1)
<< "polynomial division expects r(x) size to be similar to numerator size and higher than numerator "
"degree(x).\nr_size = "
<< r_size << ", numerator_deg[" << idx_in_batch << "] = " << numerator_deg[idx_in_batch];
ICICLE_ASSERT(q_size >= (numerator_deg[idx_in_batch] - denominator_deg[idx_in_batch] + 1))
<< "polynomial division expects q(x) size to be at least deg(numerator)-deg(denominator)+1.\nq_size = " << q_size
<< ", numerator_deg[" << idx_in_batch << "] = " << numerator_deg[idx_in_batch] << ", denominator_deg["
<< idx_in_batch << "] = " << denominator_deg[idx_in_batch];
const T* curr_numerator =
config.columns_batch ? numerator + idx_in_batch : numerator + idx_in_batch * numerator_size;
const T* curr_denominator =
config.columns_batch ? denominator + idx_in_batch : denominator + idx_in_batch * denominator_size;
T* curr_q_out = config.columns_batch ? q_out + idx_in_batch : q_out + idx_in_batch * q_size;
T* curr_r_out = config.columns_batch ? r_out + idx_in_batch : r_out + idx_in_batch * r_size;

int64_t numerator_deg, denominator_deg;
cpu_highest_non_zero_idx(device, curr_numerator, numerator_size, default_vec_ops_config(), &numerator_deg);
cpu_highest_non_zero_idx(device, curr_denominator, denominator_size, default_vec_ops_config(), &denominator_deg);
ICICLE_ASSERT(r_size >= numerator_deg + 1)
<< "polynomial division expects r(x) size to be similar to numerator size and higher than numerator degree(x)";
ICICLE_ASSERT(q_size >= (numerator_deg - denominator_deg + 1))
<< "polynomial division expects q(x) size to be at least deg(numerator)-deg(denominator)+1";

memset(curr_r_out, 0, sizeof(T) * r_size);
memcpy(curr_r_out, curr_numerator, sizeof(T) * (numerator_deg + 1));

// invert largest coeff of b
const T& lc_b_inv = T::inverse(curr_denominator[denominator_deg * stride]);
int64_t deg_r = numerator_deg;
while (deg_r >= denominator_deg) {
const T& lc_b_inv = T::inverse(curr_denominator[denominator_deg[idx_in_batch] * stride]);
deg_r[idx_in_batch] = numerator_deg[idx_in_batch];
while (deg_r[idx_in_batch] >= denominator_deg[idx_in_batch]) {
// each iteration is removing the largest monomial in r until deg(r)<deg(b)
school_book_division_step_cpu(curr_r_out, curr_q_out, curr_denominator, deg_r, denominator_deg, lc_b_inv, stride);
school_book_division_step_cpu(
curr_r_out, curr_q_out, curr_denominator, deg_r[idx_in_batch], denominator_deg[idx_in_batch], lc_b_inv, stride);
// compute degree of r
cpu_highest_non_zero_idx(device, r_out, deg_r, default_vec_ops_config(), &deg_r);
cpu_highest_non_zero_idx_internal(device, r_out, deg_r[idx_in_batch], config, deg_r.get(), idx_in_batch);
}
}
return eIcicleError::SUCCESS;
Expand Down
82 changes: 29 additions & 53 deletions icicle/tests/test_field_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -504,23 +504,6 @@ TYPED_TEST(FieldApiTest, matrixAPIsAsync)
}
};

// Option 1: Initialize each input matrix in the batch with the same ascending values
// for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) {
// for (uint32_t i = 0; i < R * C; i++) {
// if(columns_batch){
// h_inout[idx_in_batch + batch_size * i] = TypeParam::from(i);
// } else {
// h_inout[idx_in_batch * R * C + i] = TypeParam::from(i);
// }
// }
// }

// Option 2: Initialize the entire input array with ascending values
// for (int i = 0; i < total_size; i++) {
// h_inout[i] = TypeParam::from(i);
// }

// Option 3: Initialize the entire input array with random values
TypeParam::rand_host_many(h_inout.get(), total_size);

// Reference implementation
Expand Down Expand Up @@ -589,23 +572,6 @@ TYPED_TEST(FieldApiTest, bitReverse)
END_TIMER(BIT_REVERSE, oss.str().c_str(), measure);
};

// // Option 1: Initialize each input vector in the batch with the same ascending values
// for (uint32_t idx_in_batch = 0; idx_in_batch < batch_size; idx_in_batch++) {
// for (uint32_t i = 0; i < N; i++) {
// if(columns_batch){
// in_a[idx_in_batch + batch_size * i] = TypeParam::from(i);
// } else {
// in_a[idx_in_batch * N + i] = TypeParam::from(i);
// }
// }
// }

// // Option 2: Initialize the entire input array with ascending values
// for (int i = 0; i < total_size; i++) {
// in_a[i] = TypeParam::from(i);
// }

// Option 3: Initialize the entire input array with random values
FieldApiTest<TypeParam>::random_samples(in_a.get(), total_size);

// Reference implementation
Expand Down Expand Up @@ -797,42 +763,53 @@ TEST_F(FieldApiTestBase, polynomialEval)

TEST_F(FieldApiTestBase, polynomialDivision)
{
const uint64_t numerator_size = 1 << 4;
const uint64_t denominator_size = 1 << 3;
int seed = time(0);
srand(seed);
const uint64_t numerator_size = 1 << (rand() % 3 + 5);
const uint64_t denominator_size = 1 << (rand() % 2 + 3);
const uint64_t q_size = numerator_size - denominator_size + 1;
const uint64_t r_size = numerator_size;
const int batch_size = 10 + rand() % 10;

// basically we compute q(x),r(x) for a(x)=q(x)b(x)+r(x) by dividing a(x)/b(x)

// randomize matrix with rows/cols as polynomials
auto numerator = std::make_unique<scalar_t[]>(numerator_size * batch_size);
auto denominator = std::make_unique<scalar_t[]>(denominator_size * batch_size);
scalar_t::rand_host_many(numerator.get(), numerator_size * batch_size);
scalar_t::rand_host_many(denominator.get(), denominator_size * batch_size);

// Add padding to each row so that the degree is lower than the size
const int zero_pad_length = 5;
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < zero_pad_length; ++j) {
numerator[i * numerator_size + numerator_size - zero_pad_length + j] = scalar_t::zero();
denominator[i * denominator_size + denominator_size - zero_pad_length + j] = scalar_t::zero();
}
}

for (auto device : s_registered_devices) {
ICICLE_CHECK(icicle_set_device(device));
for (int columns_batch = 0; columns_batch <= 1; columns_batch++) {
ICICLE_LOG_DEBUG << "testing polynomial division on device " << device << " [column_batch=" << columns_batch
<< "]";
ICICLE_LOG_INFO << "testing polynomial division on device " << device << " [column_batch=" << columns_batch
<< "]";

// randomize matrix with rows/cols as polynomials
scalar_t::rand_host_many(numerator.get(), numerator_size * batch_size);
scalar_t::rand_host_many(denominator.get(), denominator_size * batch_size);

// Add padding to each vector so that the degree is lower than the size
const int zero_pad_length = 1;
if (columns_batch) {
for (int i = 0; i < batch_size * zero_pad_length; i++) {
numerator[batch_size * numerator_size - batch_size * zero_pad_length + i] = scalar_t::zero();
denominator[batch_size * denominator_size - batch_size * zero_pad_length + i] = scalar_t::zero();
}
} else {
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < zero_pad_length; ++j) {
numerator[i * numerator_size + numerator_size - zero_pad_length + j] = scalar_t::zero();
denominator[i * denominator_size + denominator_size - zero_pad_length + j] = scalar_t::zero();
}
}
}

auto q = std::make_unique<scalar_t[]>(q_size * batch_size);
auto r = std::make_unique<scalar_t[]>(r_size * batch_size);

auto config = default_vec_ops_config();
config.batch_size = columns_batch ? batch_size - zero_pad_length : batch_size; // skip the zero cols
config.batch_size = batch_size;
config.columns_batch = columns_batch;
// TODO v3.2 support column batch for this API
if (columns_batch) {
if (columns_batch && device == "CUDA") {
ICICLE_LOG_INFO << "Skipping polynomial division column batch";
continue;
}
Expand All @@ -853,7 +830,6 @@ TEST_F(FieldApiTestBase, polynomialDivision)
polynomial_eval(r.get(), r_size, &rand_x, 1, config, rx.get());

for (int i = 0; i < config.batch_size; ++i) {
// ICICLE_LOG_DEBUG << "ax=" << ax[i] << ", bx=" << bx[i] << ", qx=" << qx[i] << ", rx=" << rx[i];
ASSERT_EQ(ax[i], qx[i] * bx[i] + rx[i]);
}
}
Expand Down
Loading
Loading