From 516b1b473de34b15995b551a294fe66f9abd87fc Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Thu, 14 Sep 2023 22:16:32 +0900 Subject: [PATCH 01/18] libsamplerate multi-thread (OpenMP) implementation compiled and tested on -- FreeBSD clang version 14.0.5 (https://github.com/llvm/llvm-project.git llvmorg-14.0.5-0-gc12386ae247c) Target: x86_64-unknown-freebsd13.2 Thread model: posix -- --- CMakeLists.txt | 5 + src/CMakeLists.txt | 5 + src/src_sinc.c | 394 ++++++++++++++++++++++++++++++ tests/multichan_throughput_test.c | 15 ++ tests/throughput_test.c | 15 ++ 5 files changed, 434 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f5431dab..13cb59ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,11 @@ endif() if(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") option(LIBSAMPLERATE_ENABLE_SANITIZERS "Enable ASAN and UBSAN" OFF) + #if (BUILD_MULTI_THREADING) + set(MULTI_THREADING 1) + add_definitions(-DMULTI_THREADING) + #endif() + if(LIBSAMPLERATE_ENABLE_SANITIZERS) # Use ASAN and UBSAN, make it fail on any error, improve stack traces set(sanitizer_flags -fsanitize=address,undefined -fno-sanitize-recover=all -fno-omit-frame-pointer) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4a0e6c87..6500caf5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -72,6 +72,11 @@ if(LIBM_REQUIRED) target_link_libraries(samplerate PRIVATE m) endif() +if(MULTI_THREADING) + target_link_libraries(samplerate PRIVATE omp) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS_RELEASE} -fopenmp") +endif() + # Set public header set_property(TARGET samplerate PROPERTY PUBLIC_HEADER ${PROJECT_SOURCE_DIR}/include/samplerate.h) diff --git a/src/src_sinc.c b/src/src_sinc.c index 716c4a40..6f757439 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -130,6 +130,18 @@ static SRC_STATE_VT sinc_mono_state_vt = sinc_close } ; +#ifdef MULTI_THREADING +static SRC_ERROR sinc_multithread_vari_process (SRC_STATE *state, SRC_DATA *data) ; +static SRC_STATE_VT sinc_multithread_state_vt = +{ + sinc_multithread_vari_process, + sinc_multithread_vari_process, + sinc_reset, + sinc_copy, + sinc_close +} ; +#endif + static inline increment_t double_to_fp (double x) { return (increment_t) (psf_lrint ((x) * FP_ONE)) ; @@ -203,6 +215,384 @@ sinc_get_description (int src_enum) return NULL ; } /* sinc_get_descrition */ +#ifdef MULTI_THREADING + +#include +#include + +#define MULTI_THREADING_THRESHOLD (256) + +__attribute__((always_inline)) static void +calc_output_multi_mt_core(const int NUM_OF_THREADS, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) +{ + assert(NUM_OF_THREADS == 1 || NUM_OF_THREADS % 2 == 0); + double left[MAX_CHANNELS] = {0}; + double right[MAX_CHANNELS] = {0}; + + /* Convert input parameters into fixed point. */ + const increment_t max_filter_index = int_to_fp(filter->coeff_half_len); + + const int mt_increment_factor = (NUM_OF_THREADS > 1) ? NUM_OF_THREADS / 2 : 1; + const int mt_left_right_sw = child_no % 2; + const int mt_shift = child_no / 2; + + if (!mt_left_right_sw || NUM_OF_THREADS == 1) + { + /* First apply the left half of the filter. */ + increment_t filter_index1 = start_filter_index; + const int coeff_count1 = (max_filter_index - filter_index1) / increment; + filter_index1 = filter_index1 + coeff_count1 * increment; + int data_index1 = filter->b_current - channels * coeff_count1; + + if (data_index1 < 0) /* Avoid underflow access to filter->buffer. */ + { + int steps = int_div_ceil(-data_index1, channels); + /* If the assert triggers we would have to take care not to underflow/overflow */ + assert(steps <= int_div_ceil(filter_index1, increment)); + filter_index1 -= increment * steps; + data_index1 += steps * channels; + } + + filter_index1 -= increment * mt_shift; + data_index1 = data_index1 + channels * mt_shift; + + // left = 0.0; + while (filter_index1 >= MAKE_INCREMENT_T(0)) + { + const double fraction = fp_to_double(filter_index1); + const int indx = fp_to_int(filter_index1); + assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); + const double icoeff = filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); + assert(data_index1 >= 0 && data_index1 + channels - 1 < filter->b_len); + assert(data_index1 + channels - 1 < filter->b_end); + for (int ch = 0; ch < channels; ch++) + left[ch] += icoeff * filter->buffer[data_index1 + ch]; + + filter_index1 -= increment * mt_increment_factor; + data_index1 = data_index1 + channels * mt_increment_factor; + }; + } + + if (mt_left_right_sw || NUM_OF_THREADS == 1) + { + /* Now apply the right half of the filter. */ + increment_t filter_index2 = increment - start_filter_index; + const int coeff_count2 = (max_filter_index - filter_index2) / increment; + filter_index2 = filter_index2 + coeff_count2 * increment; + int data_index2 = filter->b_current + channels * (1 + coeff_count2); + // right = 0.0; + + if (!mt_shift) + { + const double fraction = fp_to_double(filter_index2); + const int indx = fp_to_int(filter_index2); + assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); + const double icoeff = filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); + assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len); + assert(data_index2 + channels - 1 < filter->b_end); + for (int ch = 0; ch < channels; ch++) + right[ch] += icoeff * filter->buffer[data_index2 + ch]; + } + + filter_index2 -= increment; + data_index2 = data_index2 - channels; + + filter_index2 -= increment * mt_shift; + data_index2 = data_index2 - channels * mt_shift; + + while (filter_index2 > MAKE_INCREMENT_T(0)) + { + const double fraction = fp_to_double(filter_index2); + const int indx = fp_to_int(filter_index2); + assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); + const double icoeff = filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); + assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len); + assert(data_index2 + channels - 1 < filter->b_end); + for (int ch = 0; ch < channels; ch++) + right[ch] += icoeff * filter->buffer[data_index2 + ch]; + + filter_index2 -= increment * mt_increment_factor; + data_index2 = data_index2 - channels * mt_increment_factor; + } + } + + for (int ch = 0; ch < channels; ch++) + output[ch] = (scale * (left[ch] + right[ch])); // double +} /* calc_output_stereo */ + +__attribute__((always_inline)) static void +calc_output_multi_mt_2(const int NUM_OF_THREADS, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) +{ +#define OPTIMIZE_LINE(x) \ + case (x): \ + calc_output_multi_mt_core(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, x, scale, output); \ + break; + + switch (channels) + { + OPTIMIZE_LINE(1); + OPTIMIZE_LINE(2); + OPTIMIZE_LINE(3); + OPTIMIZE_LINE(4); + OPTIMIZE_LINE(5); + OPTIMIZE_LINE(6); + OPTIMIZE_LINE(7); + OPTIMIZE_LINE(8); + OPTIMIZE_LINE(9); + OPTIMIZE_LINE(10); + OPTIMIZE_LINE(11); + OPTIMIZE_LINE(12); + OPTIMIZE_LINE(13); + OPTIMIZE_LINE(14); + OPTIMIZE_LINE(15); + OPTIMIZE_LINE(16); + default: + calc_output_multi_mt_core(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, channels, scale, output); + break; + } +#undef OPTIMIZE_LINE +} + +__attribute__((always_inline)) static void +calc_output_multi_mt(const int NUM_OF_THREADS, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) +{ +#define OPTIMIZE_LINE(x) \ + case (x): \ + calc_output_multi_mt_2(x, child_no, filter, increment, start_filter_index, channels, scale, output); \ + break; + + switch (NUM_OF_THREADS) + { + OPTIMIZE_LINE(1); + OPTIMIZE_LINE(2); + OPTIMIZE_LINE(4); + OPTIMIZE_LINE(6); + OPTIMIZE_LINE(8); + OPTIMIZE_LINE(10); + OPTIMIZE_LINE(12); + OPTIMIZE_LINE(14); + OPTIMIZE_LINE(16); + default: + calc_output_multi_mt_2(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, channels, scale, output); + break; + } +#undef OPTIMIZE_LINE +} + +static SRC_ERROR +_sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, SRC_STATE *const state, SRC_DATA *const data) +{ + SINC_FILTER *filter = (SINC_FILTER *)state->private_data; + double input_index, src_ratio, count, float_increment, terminate, rem; + increment_t increment, start_filter_index; + int half_filter_chan_len, samples_in_hand; + + if (state->private_data == NULL) + return SRC_ERR_NO_PRIVATE; + + /* If there is not a problem, this will be optimised out. */ + if (sizeof(filter->buffer[0]) != sizeof(data->data_in[0])) + return SRC_ERR_SIZE_INCOMPATIBILITY; + + const int channels = state->channels; + filter->in_count = data->input_frames * channels; + filter->out_count = data->output_frames * channels; + filter->in_used = filter->out_gen = 0; + + src_ratio = state->last_ratio; + + if (is_bad_src_ratio(src_ratio)) + return SRC_ERR_BAD_INTERNAL_STATE; + + /* Check the sample rate ratio wrt the buffer len. */ + count = (filter->coeff_half_len + 2.0) / filter->index_inc; + if (MIN(state->last_ratio, data->src_ratio) < 1.0) + count /= MIN(state->last_ratio, data->src_ratio); + + /* Maximum coefficientson either side of center point. */ + half_filter_chan_len = state->channels * (int)(psf_lrint(count) + 1); + + input_index = state->last_position; + + rem = fmod_one(input_index); + filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)) % filter->b_len; + input_index = rem; + + terminate = 1.0 / src_ratio + 1e-20; + + double *const data_out = (double *const)data->data_out; + const int out_count = filter->out_count; + const int index_inc = filter->index_inc; + + /* Main processing loop. */ + while (filter->out_gen < out_count) + { + /* Need to reload buffer? */ + samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len; + + if (samples_in_hand <= half_filter_chan_len) + { + if ((state->error = prepare_data(filter, channels, data, half_filter_chan_len)) != 0) + return state->error; + + samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len; + if (samples_in_hand <= half_filter_chan_len) + break; + }; + + /* This is the termination condition. */ + if (filter->b_real_end >= 0) + { + if (filter->b_current + input_index + terminate >= filter->b_real_end) + break; + }; + + if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10) + src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count; + + float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0); + increment = double_to_fp(float_increment); + + start_filter_index = double_to_fp(input_index * float_increment); + + calc_output_multi_mt(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, channels, float_increment / index_inc, data_out + filter->out_gen); + filter->out_gen += channels; + + /* Figure out the next index. */ + input_index += 1.0 / src_ratio; + rem = fmod_one(input_index); + + filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)) % filter->b_len; + input_index = rem; + }; + + state->last_position = input_index; + + /* Save current ratio rather then target ratio. */ + state->last_ratio = src_ratio; + + data->input_frames_used = filter->in_used / state->channels; + data->output_frames_gen = filter->out_gen / state->channels; + + return SRC_ERR_NO_ERROR; +} + +static SRC_ERROR +sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) +{ + if (state->private_data == NULL) + return SRC_ERR_NO_PRIVATE; + + const long in_count = data->input_frames * state->channels; + const long out_count = data->output_frames * state->channels; + + SINC_FILTER *filter = (SINC_FILTER *)state->private_data; + const int filter_buffer_len = (filter->b_len + state->channels); + + const int should_be_single_thread = (sysconf(_SC_NPROCESSORS_ONLN) < 2 || in_count < MULTI_THREADING_THRESHOLD); + + const int NUM_OF_THREADS = should_be_single_thread ? 1 : (sysconf(_SC_NPROCESSORS_ONLN) / 2 * 2); + + SRC_STATE *per_thread_state = (SRC_STATE *)malloc(NUM_OF_THREADS * sizeof(SRC_STATE)); + SRC_DATA *per_thread_data = (SRC_DATA *)malloc(NUM_OF_THREADS * sizeof(SRC_DATA)); + SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(NUM_OF_THREADS * sizeof(SINC_FILTER)); + float **per_thread_buffer = (float **)malloc(NUM_OF_THREADS * sizeof(float *)); + double **per_thread_data_out = (double **)malloc(NUM_OF_THREADS * sizeof(double *)); + SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(NUM_OF_THREADS * sizeof(SRC_ERROR)); + + SRC_ERROR retval; + + if (NUM_OF_THREADS == 1) + { + per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double)); + void *tmp_data_out = data->data_out; + + data->data_out = (void *)per_thread_data_out[0]; + + per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data); + + data->data_out = tmp_data_out; + + for (int count = 0; count < filter->out_gen; count++) + { + data->data_out[count] = per_thread_data_out[0][count]; + } + + free(per_thread_data_out[0]); + per_thread_data_out[0] = NULL; + + goto cleanup_and_return; + } + + // OpenMP + omp_set_num_threads(NUM_OF_THREADS); + +#pragma omp parallel for + for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++) + { + memcpy(&per_thread_data[child_no], data, sizeof(SRC_DATA)); + memcpy(&per_thread_filter[child_no], filter, sizeof(SINC_FILTER)); + + memcpy(&per_thread_state[child_no], state, sizeof(SRC_STATE)); + per_thread_state[child_no].private_data = &per_thread_filter[child_no]; + + per_thread_buffer[child_no] = (float *)malloc(filter_buffer_len * sizeof(float)); + memcpy(per_thread_buffer[child_no], filter->buffer, filter_buffer_len * sizeof(float)); + per_thread_filter[child_no].buffer = per_thread_buffer[child_no]; + + per_thread_data_out[child_no] = (double *)malloc(out_count * sizeof(double)); + per_thread_data[child_no].data_out = (void *)per_thread_data_out[child_no]; + + per_thread_retval[child_no] = _sinc_multichan_vari_process_mt(NUM_OF_THREADS, child_no, + &per_thread_state[child_no], &per_thread_data[child_no]); + } + + memcpy(filter->buffer, per_thread_buffer[0], filter_buffer_len * sizeof(float)); + + float *buf = filter->buffer; + memcpy(filter, &per_thread_filter[0], sizeof(SINC_FILTER)); + filter->buffer = buf; + + memcpy(state, &per_thread_state[0], sizeof(SRC_STATE)); + state->private_data = filter; + + float *d_out = data->data_out; + memcpy(data, &per_thread_data[0], sizeof(SRC_DATA)); + data->data_out = d_out; + +#pragma omp parallel for + for (int count = 0; count < filter->out_gen; count++) + { + double sum = 0.0; + for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++) + { + sum += per_thread_data_out[child_no][count]; + } + data->data_out[count] = (float)sum; + } + +#pragma omp parallel for + for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++) + { + free(per_thread_buffer[child_no]); + free(per_thread_data_out[child_no]); + } + +cleanup_and_return: + retval = per_thread_retval[0]; + + free(per_thread_state); + free(per_thread_data); + free(per_thread_filter); + free(per_thread_buffer); + free(per_thread_retval); + free(per_thread_data_out); + + return retval; +} + +#endif /* MULTI_THREADING*/ + static SINC_FILTER * sinc_filter_new (int converter_type, int channels) { @@ -282,6 +672,9 @@ sinc_state_new (int converter_type, int channels, SRC_ERROR *error) state->channels = channels ; state->mode = SRC_MODE_PROCESS ; + #ifdef MULTI_THREADING + state->vt = &sinc_multithread_state_vt ; + #else if (state->channels == 1) state->vt = &sinc_mono_state_vt ; else if (state->channels == 2) @@ -292,6 +685,7 @@ sinc_state_new (int converter_type, int channels, SRC_ERROR *error) state->vt = &sinc_hex_state_vt ; else state->vt = &sinc_multichan_state_vt ; + #endif state->private_data = sinc_filter_new (converter_type, state->channels) ; if (!state->private_data) diff --git a/tests/multichan_throughput_test.c b/tests/multichan_throughput_test.c index 5cab44a1..9cf5639b 100644 --- a/tests/multichan_throughput_test.c +++ b/tests/multichan_throughput_test.c @@ -41,7 +41,11 @@ static float output [BUFFER_LEN] ; static void throughput_test (int converter, int channels, long *best_throughput) { SRC_DATA src_data ; +#ifdef MULTI_THREADING + struct timespec start_gettime, finish_gettime; +#else clock_t start_time, clock_time ; +#endif double duration ; long total_frames = 0, throughput ; int error ; @@ -63,7 +67,11 @@ throughput_test (int converter, int channels, long *best_throughput) sleep (2) ; #endif +#ifdef MULTI_THREADING + clock_gettime(CLOCK_MONOTONIC, &start_gettime); +#else start_time = clock () ; +#endif do { @@ -74,8 +82,15 @@ throughput_test (int converter, int channels, long *best_throughput) total_frames += src_data.output_frames_gen ; +#ifdef MULTI_THREADING + clock_gettime(CLOCK_MONOTONIC, &finish_gettime); + + duration = (finish_gettime.tv_sec - start_gettime.tv_sec); + duration += (finish_gettime.tv_nsec - start_gettime.tv_nsec) / 1000000000.0; +#else clock_time = clock () - start_time ; duration = (1.0 * clock_time) / CLOCKS_PER_SEC ; +#endif } while (duration < 5.0) ; diff --git a/tests/throughput_test.c b/tests/throughput_test.c index e9974800..0a6aa834 100644 --- a/tests/throughput_test.c +++ b/tests/throughput_test.c @@ -38,7 +38,11 @@ static float output [BUFFER_LEN] ; static long throughput_test (int converter, long best_throughput) { SRC_DATA src_data ; +#ifdef MULTI_THREADING + struct timespec start_gettime, finish_gettime; +#else clock_t start_time, clock_time ; +#endif double duration ; long total_frames = 0, throughput ; int error ; @@ -60,7 +64,11 @@ throughput_test (int converter, long best_throughput) sleep (2) ; #endif +#ifdef MULTI_THREADING + clock_gettime(CLOCK_MONOTONIC, &start_gettime); +#else start_time = clock () ; +#endif do { @@ -71,11 +79,18 @@ throughput_test (int converter, long best_throughput) total_frames += src_data.output_frames_gen ; +#ifdef MULTI_THREADING + clock_gettime(CLOCK_MONOTONIC, &finish_gettime); + + duration = (finish_gettime.tv_sec - start_gettime.tv_sec); + duration += (finish_gettime.tv_nsec - start_gettime.tv_nsec) / 1000000000.0; +#else clock_time = clock () - start_time ; #ifdef __GNU__ /* Clock resolution is 10ms on GNU/Hurd */ duration = (10000.0 * clock_time) / CLOCKS_PER_SEC ; #else duration = (1.0 * clock_time) / CLOCKS_PER_SEC ; +#endif #endif } while (duration < 3.0) ; From df1aaae73ca1df4cdc7d72a73a70e40dcade0449 Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Thu, 14 Sep 2023 22:18:26 +0900 Subject: [PATCH 02/18] MT disable by default. --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 13cb59ca..4a8f2b3f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,10 +72,10 @@ endif() if(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") option(LIBSAMPLERATE_ENABLE_SANITIZERS "Enable ASAN and UBSAN" OFF) - #if (BUILD_MULTI_THREADING) + if (BUILD_MULTI_THREADING) set(MULTI_THREADING 1) add_definitions(-DMULTI_THREADING) - #endif() + endif() if(LIBSAMPLERATE_ENABLE_SANITIZERS) # Use ASAN and UBSAN, make it fail on any error, improve stack traces From 8c0f67615176546647c9a880039d2a241e95675c Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Fri, 15 Sep 2023 19:19:26 +0900 Subject: [PATCH 03/18] error handlings and small refactor --- src/src_sinc.c | 155 +++++++++++++++++++++++++++++++------------------ 1 file changed, 97 insertions(+), 58 deletions(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index 6f757439..fa66adbe 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -220,23 +220,25 @@ sinc_get_description (int src_enum) #include #include +/* smaller frames are processed in single thread to avoid overheads */ #define MULTI_THREADING_THRESHOLD (256) __attribute__((always_inline)) static void -calc_output_multi_mt_core(const int NUM_OF_THREADS, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) +calc_output_multi_mt_core(const int num_of_threads, const int child_no, + const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) { - assert(NUM_OF_THREADS == 1 || NUM_OF_THREADS % 2 == 0); + assert(num_of_threads == 1 || num_of_threads % 2 == 0); double left[MAX_CHANNELS] = {0}; double right[MAX_CHANNELS] = {0}; /* Convert input parameters into fixed point. */ const increment_t max_filter_index = int_to_fp(filter->coeff_half_len); - const int mt_increment_factor = (NUM_OF_THREADS > 1) ? NUM_OF_THREADS / 2 : 1; + const int mt_increment_factor = (num_of_threads > 1) ? num_of_threads / 2 : 1; const int mt_left_right_sw = child_no % 2; const int mt_shift = child_no / 2; - if (!mt_left_right_sw || NUM_OF_THREADS == 1) + if (!mt_left_right_sw || num_of_threads == 1) { /* First apply the left half of the filter. */ increment_t filter_index1 = start_filter_index; @@ -273,7 +275,7 @@ calc_output_multi_mt_core(const int NUM_OF_THREADS, const int child_no, const SI }; } - if (mt_left_right_sw || NUM_OF_THREADS == 1) + if (mt_left_right_sw || num_of_threads == 1) { /* Now apply the right half of the filter. */ increment_t filter_index2 = increment - start_filter_index; @@ -321,14 +323,15 @@ calc_output_multi_mt_core(const int NUM_OF_THREADS, const int child_no, const SI } /* calc_output_stereo */ __attribute__((always_inline)) static void -calc_output_multi_mt_2(const int NUM_OF_THREADS, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) +calc_output_multi_mt_2(const int num_of_threads, const int child_no, + const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) { #define OPTIMIZE_LINE(x) \ case (x): \ - calc_output_multi_mt_core(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, x, scale, output); \ + calc_output_multi_mt_core(num_of_threads, child_no, filter, increment, start_filter_index, x, scale, output); \ break; - switch (channels) + switch (channels) // to kick the compile-time optimizer, channel numbers up to 16 are extracted as constants here. { OPTIMIZE_LINE(1); OPTIMIZE_LINE(2); @@ -347,21 +350,22 @@ calc_output_multi_mt_2(const int NUM_OF_THREADS, const int child_no, const SINC_ OPTIMIZE_LINE(15); OPTIMIZE_LINE(16); default: - calc_output_multi_mt_core(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_core(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output); break; } #undef OPTIMIZE_LINE } __attribute__((always_inline)) static void -calc_output_multi_mt(const int NUM_OF_THREADS, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) +calc_output_multi_mt(const int num_of_threads, const int child_no, + const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) { #define OPTIMIZE_LINE(x) \ case (x): \ calc_output_multi_mt_2(x, child_no, filter, increment, start_filter_index, channels, scale, output); \ break; - switch (NUM_OF_THREADS) + switch (num_of_threads) // to kick the compile-time optimizer, the number of threads is extracted as constant here. { OPTIMIZE_LINE(1); OPTIMIZE_LINE(2); @@ -373,14 +377,15 @@ calc_output_multi_mt(const int NUM_OF_THREADS, const int child_no, const SINC_FI OPTIMIZE_LINE(14); OPTIMIZE_LINE(16); default: - calc_output_multi_mt_2(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_2(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output); break; } #undef OPTIMIZE_LINE } static SRC_ERROR -_sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, SRC_STATE *const state, SRC_DATA *const data) +_sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, double *const per_thread_data_out, + SRC_STATE *const state, SRC_DATA *const data) { SINC_FILTER *filter = (SINC_FILTER *)state->private_data; double input_index, src_ratio, count, float_increment, terminate, rem; @@ -420,8 +425,7 @@ _sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, SR terminate = 1.0 / src_ratio + 1e-20; - double *const data_out = (double *const)data->data_out; - const int out_count = filter->out_count; + const long out_count = filter->out_count; const int index_inc = filter->index_inc; /* Main processing loop. */ @@ -443,7 +447,7 @@ _sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, SR /* This is the termination condition. */ if (filter->b_real_end >= 0) { - if (filter->b_current + input_index + terminate >= filter->b_real_end) + if (filter->b_current + input_index + terminate > filter->b_real_end) break; }; @@ -455,7 +459,7 @@ _sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, SR start_filter_index = double_to_fp(input_index * float_increment); - calc_output_multi_mt(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, channels, float_increment / index_inc, data_out + filter->out_gen); + calc_output_multi_mt(num_of_threads, child_no, filter, increment, start_filter_index, channels, float_increment / index_inc, per_thread_data_out + filter->out_gen); filter->out_gen += channels; /* Figure out the next index. */ @@ -491,62 +495,80 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) const int should_be_single_thread = (sysconf(_SC_NPROCESSORS_ONLN) < 2 || in_count < MULTI_THREADING_THRESHOLD); - const int NUM_OF_THREADS = should_be_single_thread ? 1 : (sysconf(_SC_NPROCESSORS_ONLN) / 2 * 2); + const int num_of_threads = should_be_single_thread ? 1 : (sysconf(_SC_NPROCESSORS_ONLN) / 2 * 2); - SRC_STATE *per_thread_state = (SRC_STATE *)malloc(NUM_OF_THREADS * sizeof(SRC_STATE)); - SRC_DATA *per_thread_data = (SRC_DATA *)malloc(NUM_OF_THREADS * sizeof(SRC_DATA)); - SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(NUM_OF_THREADS * sizeof(SINC_FILTER)); - float **per_thread_buffer = (float **)malloc(NUM_OF_THREADS * sizeof(float *)); - double **per_thread_data_out = (double **)malloc(NUM_OF_THREADS * sizeof(double *)); - SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(NUM_OF_THREADS * sizeof(SRC_ERROR)); + SRC_STATE *per_thread_state = (SRC_STATE *)malloc(num_of_threads * sizeof(SRC_STATE)); + SRC_DATA *per_thread_data = (SRC_DATA *)malloc(num_of_threads * sizeof(SRC_DATA)); + SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(num_of_threads * sizeof(SINC_FILTER)); + SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(num_of_threads * sizeof(SRC_ERROR)); + + float **per_thread_buffer = (float **)calloc(num_of_threads, sizeof(float *)); + double **per_thread_data_out = (double **)calloc(num_of_threads, sizeof(double *)); - SRC_ERROR retval; + SRC_ERROR retval = SRC_ERR_MALLOC_FAILED; - if (NUM_OF_THREADS == 1) + if ( !per_thread_state || !per_thread_data || !per_thread_filter + || !per_thread_buffer || !per_thread_data_out || !per_thread_retval ) { - per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double)); - void *tmp_data_out = data->data_out; - - data->data_out = (void *)per_thread_data_out[0]; + goto cleanup_and_return; + } - per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data); + if (num_of_threads == 1) // w/o OpenMP + { + per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double)); + if ( !per_thread_data_out[0] ) goto cleanup_and_return; - data->data_out = tmp_data_out; + per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, per_thread_data_out[0], state, data); for (int count = 0; count < filter->out_gen; count++) { data->data_out[count] = per_thread_data_out[0][count]; } - free(per_thread_data_out[0]); - per_thread_data_out[0] = NULL; + retval = per_thread_retval[0]; goto cleanup_and_return; } // OpenMP - omp_set_num_threads(NUM_OF_THREADS); + omp_set_num_threads(num_of_threads); #pragma omp parallel for - for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++) + for (int child_no = 0; child_no < num_of_threads; child_no++) { + per_thread_buffer[child_no] = (float *)malloc(filter_buffer_len * sizeof(float)); + per_thread_data_out[child_no] = (double *)malloc(out_count * sizeof(double)); + + if ( !per_thread_buffer[child_no] || !per_thread_data_out[child_no] ){ + + per_thread_retval[child_no] = SRC_ERR_MALLOC_FAILED; + + continue; + } + memcpy(&per_thread_data[child_no], data, sizeof(SRC_DATA)); memcpy(&per_thread_filter[child_no], filter, sizeof(SINC_FILTER)); memcpy(&per_thread_state[child_no], state, sizeof(SRC_STATE)); per_thread_state[child_no].private_data = &per_thread_filter[child_no]; - per_thread_buffer[child_no] = (float *)malloc(filter_buffer_len * sizeof(float)); memcpy(per_thread_buffer[child_no], filter->buffer, filter_buffer_len * sizeof(float)); per_thread_filter[child_no].buffer = per_thread_buffer[child_no]; - per_thread_data_out[child_no] = (double *)malloc(out_count * sizeof(double)); - per_thread_data[child_no].data_out = (void *)per_thread_data_out[child_no]; + per_thread_retval[child_no] = _sinc_multichan_vari_process_mt( + num_of_threads, child_no, per_thread_data_out[child_no], + &per_thread_state[child_no], &per_thread_data[child_no]); + } - per_thread_retval[child_no] = _sinc_multichan_vari_process_mt(NUM_OF_THREADS, child_no, - &per_thread_state[child_no], &per_thread_data[child_no]); + // error checking for each worker + for (int child_no = 0; child_no < num_of_threads; child_no++){ + if ( per_thread_retval[child_no] != SRC_ERR_NO_ERROR ){ + retval = per_thread_retval[child_no]; + goto cleanup_and_return; + } } + // update filter status memcpy(filter->buffer, per_thread_buffer[0], filter_buffer_len * sizeof(float)); float *buf = filter->buffer; @@ -556,37 +578,54 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) memcpy(state, &per_thread_state[0], sizeof(SRC_STATE)); state->private_data = filter; - float *d_out = data->data_out; memcpy(data, &per_thread_data[0], sizeof(SRC_DATA)); - data->data_out = d_out; #pragma omp parallel for - for (int count = 0; count < filter->out_gen; count++) + for (int count = 0; count < filter->out_gen; count++) // sum up every worker's result { double sum = 0.0; - for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++) + for (int child_no = 0; child_no < num_of_threads; child_no++) { sum += per_thread_data_out[child_no][count]; } data->data_out[count] = (float)sum; } -#pragma omp parallel for - for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++) + retval = SRC_ERR_NO_ERROR; + +cleanup_and_return: + + if (per_thread_state) + free(per_thread_state); + + if (per_thread_data) + free(per_thread_data); + + if (per_thread_filter) + free(per_thread_filter); + + if (per_thread_retval) + free(per_thread_retval); + + if (per_thread_buffer) { - free(per_thread_buffer[child_no]); - free(per_thread_data_out[child_no]); + for (int child_no = 0; child_no < num_of_threads; child_no++) + { + if (per_thread_buffer[child_no]) + free(per_thread_buffer[child_no]); + } + + free(per_thread_buffer); } -cleanup_and_return: - retval = per_thread_retval[0]; - - free(per_thread_state); - free(per_thread_data); - free(per_thread_filter); - free(per_thread_buffer); - free(per_thread_retval); - free(per_thread_data_out); + if (per_thread_data_out) { + for (int child_no = 0; child_no < num_of_threads; child_no++) { + if (per_thread_data_out[child_no]) + free(per_thread_data_out[child_no]); + } + + free(per_thread_data_out); + } return retval; } From 2799ec346715f6812c46e643b7d08c197d893299 Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Sat, 16 Sep 2023 20:19:02 +0900 Subject: [PATCH 04/18] some more optimize --- src/src_sinc.c | 91 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 60 insertions(+), 31 deletions(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index fa66adbe..df0e6bc6 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -224,7 +224,7 @@ sinc_get_description (int src_enum) #define MULTI_THREADING_THRESHOLD (256) __attribute__((always_inline)) static void -calc_output_multi_mt_core(const int num_of_threads, const int child_no, +calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) { assert(num_of_threads == 1 || num_of_threads % 2 == 0); @@ -264,7 +264,7 @@ calc_output_multi_mt_core(const int num_of_threads, const int child_no, const double fraction = fp_to_double(filter_index1); const int indx = fp_to_int(filter_index1); assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); - const double icoeff = filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); + const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); assert(data_index1 >= 0 && data_index1 + channels - 1 < filter->b_len); assert(data_index1 + channels - 1 < filter->b_end); for (int ch = 0; ch < channels; ch++) @@ -289,7 +289,7 @@ calc_output_multi_mt_core(const int num_of_threads, const int child_no, const double fraction = fp_to_double(filter_index2); const int indx = fp_to_int(filter_index2); assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); - const double icoeff = filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); + const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len); assert(data_index2 + channels - 1 < filter->b_end); for (int ch = 0; ch < channels; ch++) @@ -307,7 +307,7 @@ calc_output_multi_mt_core(const int num_of_threads, const int child_no, const double fraction = fp_to_double(filter_index2); const int indx = fp_to_int(filter_index2); assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); - const double icoeff = filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); + const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len); assert(data_index2 + channels - 1 < filter->b_end); for (int ch = 0; ch < channels; ch++) @@ -323,12 +323,28 @@ calc_output_multi_mt_core(const int num_of_threads, const int child_no, } /* calc_output_stereo */ __attribute__((always_inline)) static void -calc_output_multi_mt_2(const int num_of_threads, const int child_no, +calc_output_multi_mt_3(const int num_of_threads, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) { -#define OPTIMIZE_LINE(x) \ - case (x): \ - calc_output_multi_mt_core(num_of_threads, child_no, filter, increment, start_filter_index, x, scale, output); \ + + const int skip_fraction = increment == ((increment >> SHIFT_BITS) << SHIFT_BITS) && start_filter_index == ((start_filter_index >> SHIFT_BITS) << SHIFT_BITS) ? 1 : 0; + + if (skip_fraction) + { + calc_output_multi_mt_core(1, num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output); + } + else + { + calc_output_multi_mt_core(0, num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output); + } +} + +__attribute__((always_inline)) static void +calc_output_multi_mt_2(const int num_of_threads, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) +{ +#define OPTIMIZE_LINE(x) \ + case (x): \ + calc_output_multi_mt_3(num_of_threads, child_no, filter, increment, start_filter_index, x, scale, output); \ break; switch (channels) // to kick the compile-time optimizer, channel numbers up to 16 are extracted as constants here. @@ -350,7 +366,7 @@ calc_output_multi_mt_2(const int num_of_threads, const int child_no, OPTIMIZE_LINE(15); OPTIMIZE_LINE(16); default: - calc_output_multi_mt_core(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_3(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output); break; } #undef OPTIMIZE_LINE @@ -387,14 +403,11 @@ static SRC_ERROR _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, double *const per_thread_data_out, SRC_STATE *const state, SRC_DATA *const data) { - SINC_FILTER *filter = (SINC_FILTER *)state->private_data; - double input_index, src_ratio, count, float_increment, terminate, rem; - increment_t increment, start_filter_index; - int half_filter_chan_len, samples_in_hand; - if (state->private_data == NULL) return SRC_ERR_NO_PRIVATE; + SINC_FILTER *filter = (SINC_FILTER *)state->private_data; + /* If there is not a problem, this will be optimised out. */ if (sizeof(filter->buffer[0]) != sizeof(data->data_in[0])) return SRC_ERR_SIZE_INCOMPATIBILITY; @@ -404,35 +417,41 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do filter->out_count = data->output_frames * channels; filter->in_used = filter->out_gen = 0; - src_ratio = state->last_ratio; + double src_ratio = state->last_ratio; if (is_bad_src_ratio(src_ratio)) return SRC_ERR_BAD_INTERNAL_STATE; /* Check the sample rate ratio wrt the buffer len. */ - count = (filter->coeff_half_len + 2.0) / filter->index_inc; + double count = (filter->coeff_half_len + 2.0) / filter->index_inc; if (MIN(state->last_ratio, data->src_ratio) < 1.0) count /= MIN(state->last_ratio, data->src_ratio); /* Maximum coefficientson either side of center point. */ - half_filter_chan_len = state->channels * (int)(psf_lrint(count) + 1); + const int half_filter_chan_len = state->channels * (int)(psf_lrint(count) + 1); - input_index = state->last_position; + double input_index = state->last_position; - rem = fmod_one(input_index); + double rem = fmod_one(input_index); filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)) % filter->b_len; input_index = rem; - terminate = 1.0 / src_ratio + 1e-20; + const double terminate = 1.0 / src_ratio + 1e-20; const long out_count = filter->out_count; const int index_inc = filter->index_inc; + const int is_constant_ratio = (state->last_ratio == data->src_ratio) ? 1 : 0; + const double constant_input_index_inc = 1.0 / src_ratio; + const double constant_float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0); + const increment_t constant_increment = double_to_fp(constant_float_increment); + const double constant_scale = constant_float_increment / index_inc; + /* Main processing loop. */ while (filter->out_gen < out_count) { /* Need to reload buffer? */ - samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len; + int samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len; if (samples_in_hand <= half_filter_chan_len) { @@ -451,19 +470,29 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do break; }; - if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10) - src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count; - - float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0); - increment = double_to_fp(float_increment); - - start_filter_index = double_to_fp(input_index * float_increment); - - calc_output_multi_mt(num_of_threads, child_no, filter, increment, start_filter_index, channels, float_increment / index_inc, per_thread_data_out + filter->out_gen); + double scale, float_increment; + increment_t increment; + if ( !is_constant_ratio ){ + if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10) + src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count; + + float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0); + increment = double_to_fp(float_increment); + scale = float_increment / index_inc; + } + else{ + float_increment = constant_float_increment; + increment = constant_increment; + scale = constant_scale; + } + + increment_t start_filter_index = double_to_fp(input_index * float_increment); + + calc_output_multi_mt(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, per_thread_data_out + filter->out_gen); filter->out_gen += channels; /* Figure out the next index. */ - input_index += 1.0 / src_ratio; + input_index += (is_constant_ratio) ? constant_input_index_inc : 1.0 / src_ratio; rem = fmod_one(input_index); filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)) % filter->b_len; From 797a807267ce536400fd0fdd42457606c030d445 Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Sun, 17 Sep 2023 20:41:01 +0900 Subject: [PATCH 05/18] correct indentation --- src/src_sinc.c | 76 ++++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index df0e6bc6..4c546e64 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -441,11 +441,11 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do const long out_count = filter->out_count; const int index_inc = filter->index_inc; - const int is_constant_ratio = (state->last_ratio == data->src_ratio) ? 1 : 0; - const double constant_input_index_inc = 1.0 / src_ratio; - const double constant_float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0); - const increment_t constant_increment = double_to_fp(constant_float_increment); - const double constant_scale = constant_float_increment / index_inc; + const int is_constant_ratio = (state->last_ratio == data->src_ratio) ? 1 : 0; + const double constant_input_index_inc = 1.0 / src_ratio; + const double constant_float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0); + const increment_t constant_increment = double_to_fp(constant_float_increment); + const double constant_scale = constant_float_increment / index_inc; /* Main processing loop. */ while (filter->out_gen < out_count) @@ -470,21 +470,23 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do break; }; - double scale, float_increment; - increment_t increment; - if ( !is_constant_ratio ){ - if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10) - src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count; - - float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0); - increment = double_to_fp(float_increment); - scale = float_increment / index_inc; - } - else{ - float_increment = constant_float_increment; - increment = constant_increment; - scale = constant_scale; - } + double scale, float_increment; + increment_t increment; + if (!is_constant_ratio) + { + if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10) + src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count; + + float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0); + increment = double_to_fp(float_increment); + scale = float_increment / index_inc; + } + else + { + float_increment = constant_float_increment; + increment = constant_increment; + scale = constant_scale; + } increment_t start_filter_index = double_to_fp(input_index * float_increment); @@ -530,7 +532,7 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) SRC_DATA *per_thread_data = (SRC_DATA *)malloc(num_of_threads * sizeof(SRC_DATA)); SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(num_of_threads * sizeof(SINC_FILTER)); SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(num_of_threads * sizeof(SRC_ERROR)); - + float **per_thread_buffer = (float **)calloc(num_of_threads, sizeof(float *)); double **per_thread_data_out = (double **)calloc(num_of_threads, sizeof(double *)); @@ -542,10 +544,11 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) goto cleanup_and_return; } - if (num_of_threads == 1) // w/o OpenMP + if (num_of_threads == 1) // w/o OpenMP { per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double)); - if ( !per_thread_data_out[0] ) goto cleanup_and_return; + if (!per_thread_data_out[0]) + goto cleanup_and_return; per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, per_thread_data_out[0], state, data); @@ -568,7 +571,8 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) per_thread_buffer[child_no] = (float *)malloc(filter_buffer_len * sizeof(float)); per_thread_data_out[child_no] = (double *)malloc(out_count * sizeof(double)); - if ( !per_thread_buffer[child_no] || !per_thread_data_out[child_no] ){ + if (!per_thread_buffer[child_no] || !per_thread_data_out[child_no]) + { per_thread_retval[child_no] = SRC_ERR_MALLOC_FAILED; @@ -585,16 +589,18 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) per_thread_filter[child_no].buffer = per_thread_buffer[child_no]; per_thread_retval[child_no] = _sinc_multichan_vari_process_mt( - num_of_threads, child_no, per_thread_data_out[child_no], + num_of_threads, child_no, per_thread_data_out[child_no], &per_thread_state[child_no], &per_thread_data[child_no]); } // error checking for each worker - for (int child_no = 0; child_no < num_of_threads; child_no++){ - if ( per_thread_retval[child_no] != SRC_ERR_NO_ERROR ){ - retval = per_thread_retval[child_no]; - goto cleanup_and_return; - } + for (int child_no = 0; child_no < num_of_threads; child_no++) + { + if (per_thread_retval[child_no] != SRC_ERR_NO_ERROR) + { + retval = per_thread_retval[child_no]; + goto cleanup_and_return; + } } // update filter status @@ -610,7 +616,7 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) memcpy(data, &per_thread_data[0], sizeof(SRC_DATA)); #pragma omp parallel for - for (int count = 0; count < filter->out_gen; count++) // sum up every worker's result + for (int count = 0; count < filter->out_gen; count++) // sum up every worker's result { double sum = 0.0; for (int child_no = 0; child_no < num_of_threads; child_no++) @@ -632,7 +638,7 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) if (per_thread_filter) free(per_thread_filter); - + if (per_thread_retval) free(per_thread_retval); @@ -647,8 +653,10 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) free(per_thread_buffer); } - if (per_thread_data_out) { - for (int child_no = 0; child_no < num_of_threads; child_no++) { + if (per_thread_data_out) + { + for (int child_no = 0; child_no < num_of_threads; child_no++) + { if (per_thread_data_out[child_no]) free(per_thread_data_out[child_no]); } From 9c9959dbbea24b36bf786dda39970272511f29f9 Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Mon, 13 Nov 2023 06:38:33 +0900 Subject: [PATCH 06/18] - Windows (MSVC) support - eliminate redundant buffer --- CMakeLists.txt | 10 +- src/CMakeLists.txt | 15 ++- src/src_sinc.c | 186 ++++++++++++++++++------------ tests/multichan_throughput_test.c | 6 +- tests/throughput_test.c | 6 +- 5 files changed, 134 insertions(+), 89 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4a8f2b3f..ef2fb6ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,10 @@ cmake_minimum_required(VERSION 3.1..3.18) +if (BUILD_MULTI_THREADING) + set(MULTI_THREADING 1) + add_definitions(-DMULTI_THREADING) +endif() + # Policies # Include file check macros honor CMAKE_REQUIRED_LIBRARIES, CMake >= 3.12 @@ -72,11 +77,6 @@ endif() if(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") option(LIBSAMPLERATE_ENABLE_SANITIZERS "Enable ASAN and UBSAN" OFF) - if (BUILD_MULTI_THREADING) - set(MULTI_THREADING 1) - add_definitions(-DMULTI_THREADING) - endif() - if(LIBSAMPLERATE_ENABLE_SANITIZERS) # Use ASAN and UBSAN, make it fail on any error, improve stack traces set(sanitizer_flags -fsanitize=address,undefined -fno-sanitize-recover=all -fno-omit-frame-pointer) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6500caf5..21ce661c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -43,6 +43,16 @@ add_library(samplerate # ALIAS to use if libsamplerate is included from other project with add_subdirectory() add_library(SampleRate::samplerate ALIAS samplerate) +if(MULTI_THREADING) + if(WIN32) + target_link_libraries(samplerate PRIVATE libomp) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /openmp:llvm") + else() + target_link_libraries(samplerate PRIVATE omp) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp") + endif() +endif() + # CMake generates wrong DLL names for MinGW and Cygwin, fix it if(BUILD_SHARED_LIBS AND WIN32) if(LIBSAMPLERATE_COMPATIBLE_NAME) @@ -72,11 +82,6 @@ if(LIBM_REQUIRED) target_link_libraries(samplerate PRIVATE m) endif() -if(MULTI_THREADING) - target_link_libraries(samplerate PRIVATE omp) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS_RELEASE} -fopenmp") -endif() - # Set public header set_property(TARGET samplerate PROPERTY PUBLIC_HEADER ${PROJECT_SOURCE_DIR}/include/samplerate.h) diff --git a/src/src_sinc.c b/src/src_sinc.c index 4c546e64..ce25b3e9 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -218,13 +218,20 @@ sinc_get_description (int src_enum) #ifdef MULTI_THREADING #include -#include + +#ifdef _WIN32 + #define ALWAYS_INLINE __forceinline + #include +#else + #define ALWAYS_INLINE __attribute__((always_inline)) static + #include +#endif /* smaller frames are processed in single thread to avoid overheads */ #define MULTI_THREADING_THRESHOLD (256) -__attribute__((always_inline)) static void -calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, const int child_no, +ALWAYS_INLINE void +calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, const int th_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) { assert(num_of_threads == 1 || num_of_threads % 2 == 0); @@ -235,8 +242,8 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con const increment_t max_filter_index = int_to_fp(filter->coeff_half_len); const int mt_increment_factor = (num_of_threads > 1) ? num_of_threads / 2 : 1; - const int mt_left_right_sw = child_no % 2; - const int mt_shift = child_no / 2; + const int mt_left_right_sw = th_no % 2; + const int mt_shift = th_no / 2; if (!mt_left_right_sw || num_of_threads == 1) { @@ -320,10 +327,10 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con for (int ch = 0; ch < channels; ch++) output[ch] = (scale * (left[ch] + right[ch])); // double -} /* calc_output_stereo */ +} /* calc_output_multi_mt_core */ -__attribute__((always_inline)) static void -calc_output_multi_mt_3(const int num_of_threads, const int child_no, +ALWAYS_INLINE void +calc_output_multi_mt_3(const int num_of_threads, const int th_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) { @@ -331,20 +338,20 @@ calc_output_multi_mt_3(const int num_of_threads, const int child_no, if (skip_fraction) { - calc_output_multi_mt_core(1, num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_core(1, num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output); } else { - calc_output_multi_mt_core(0, num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_core(0, num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output); } } -__attribute__((always_inline)) static void -calc_output_multi_mt_2(const int num_of_threads, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) +ALWAYS_INLINE void +calc_output_multi_mt_2(const int num_of_threads, const int th_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) { #define OPTIMIZE_LINE(x) \ case (x): \ - calc_output_multi_mt_3(num_of_threads, child_no, filter, increment, start_filter_index, x, scale, output); \ + calc_output_multi_mt_3(num_of_threads, th_no, filter, increment, start_filter_index, x, scale, output); \ break; switch (channels) // to kick the compile-time optimizer, channel numbers up to 16 are extracted as constants here. @@ -366,47 +373,43 @@ calc_output_multi_mt_2(const int num_of_threads, const int child_no, const SINC_ OPTIMIZE_LINE(15); OPTIMIZE_LINE(16); default: - calc_output_multi_mt_3(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_3(num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output); break; } #undef OPTIMIZE_LINE } -__attribute__((always_inline)) static void -calc_output_multi_mt(const int num_of_threads, const int child_no, +ALWAYS_INLINE void +calc_output_multi_mt(const int num_of_threads, const int th_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) { #define OPTIMIZE_LINE(x) \ case (x): \ - calc_output_multi_mt_2(x, child_no, filter, increment, start_filter_index, channels, scale, output); \ + calc_output_multi_mt_2(x, th_no, filter, increment, start_filter_index, channels, scale, output); \ break; switch (num_of_threads) // to kick the compile-time optimizer, the number of threads is extracted as constant here. { OPTIMIZE_LINE(1); - OPTIMIZE_LINE(2); - OPTIMIZE_LINE(4); OPTIMIZE_LINE(6); - OPTIMIZE_LINE(8); OPTIMIZE_LINE(10); - OPTIMIZE_LINE(12); OPTIMIZE_LINE(14); - OPTIMIZE_LINE(16); default: - calc_output_multi_mt_2(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_2(num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output); break; } #undef OPTIMIZE_LINE } static SRC_ERROR -_sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, double *const per_thread_data_out, - SRC_STATE *const state, SRC_DATA *const data) +_sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, const int interleave, double *const per_thread_data_out, + SRC_STATE *const state, SRC_DATA *const data, SRC_STATE *const main_state ) { if (state->private_data == NULL) return SRC_ERR_NO_PRIVATE; SINC_FILTER *filter = (SINC_FILTER *)state->private_data; + SINC_FILTER *main_filter = (SINC_FILTER *)main_state->private_data; /* If there is not a problem, this will be optimised out. */ if (sizeof(filter->buffer[0]) != sizeof(data->data_in[0])) @@ -428,7 +431,7 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do count /= MIN(state->last_ratio, data->src_ratio); /* Maximum coefficientson either side of center point. */ - const int half_filter_chan_len = state->channels * (int)(psf_lrint(count) + 1); + const int half_filter_chan_len = channels * (int)(psf_lrint(count) + 1); double input_index = state->last_position; @@ -448,6 +451,8 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do const double constant_scale = constant_float_increment / index_inc; /* Main processing loop. */ + int interleave_counter = 0; + const int interleave_mask = interleave - 1; while (filter->out_gen < out_count) { /* Need to reload buffer? */ @@ -455,7 +460,24 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do if (samples_in_hand <= half_filter_chan_len) { - if ((state->error = prepare_data(filter, channels, data, half_filter_chan_len)) != 0) + // only one buffer is used (shared by all threads) + { + #pragma omp barrier + #pragma omp single + { + state->error = prepare_data(filter, channels, data, half_filter_chan_len); + + *main_state = *state; + *main_filter = *filter; + } + #pragma omp barrier + { + *state = *main_state; + *filter = *main_filter; + } + } + + if (state->error != 0) return state->error; samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len; @@ -490,7 +512,10 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do increment_t start_filter_index = double_to_fp(input_index * float_increment); - calc_output_multi_mt(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, per_thread_data_out + filter->out_gen); + if ( (interleave_counter & interleave_mask) == (child_no & interleave_mask) ){ + calc_output_multi_mt(NUM_OF_THREADS/interleave, child_no/interleave, filter, increment, start_filter_index, channels, scale, per_thread_data_out + filter->out_gen/channels/interleave * channels); + } + interleave_counter = (interleave_counter+1) & interleave_mask; filter->out_gen += channels; /* Figure out the next index. */ @@ -506,8 +531,8 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do /* Save current ratio rather then target ratio. */ state->last_ratio = src_ratio; - data->input_frames_used = filter->in_used / state->channels; - data->output_frames_gen = filter->out_gen / state->channels; + data->input_frames_used = filter->in_used / channels; + data->output_frames_gen = filter->out_gen / channels; return SRC_ERR_NO_ERROR; } @@ -518,43 +543,65 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) if (state->private_data == NULL) return SRC_ERR_NO_PRIVATE; - const long in_count = data->input_frames * state->channels; - const long out_count = data->output_frames * state->channels; + const int channels = state->channels; + + const long in_count = data->input_frames * channels; + const long out_count = data->output_frames * channels; SINC_FILTER *filter = (SINC_FILTER *)state->private_data; - const int filter_buffer_len = (filter->b_len + state->channels); + const int filter_buffer_len = (filter->b_len + channels); - const int should_be_single_thread = (sysconf(_SC_NPROCESSORS_ONLN) < 2 || in_count < MULTI_THREADING_THRESHOLD); +#ifdef _WIN32 + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + const int N_OF_CORES = sysinfo.dwNumberOfProcessors; +#else + const int N_OF_CORES = sysconf(_SC_NPROCESSORS_ONLN); +#endif - const int num_of_threads = should_be_single_thread ? 1 : (sysconf(_SC_NPROCESSORS_ONLN) / 2 * 2); + const int should_be_single_thread = (N_OF_CORES < 2 || in_count < MULTI_THREADING_THRESHOLD); + const int NUM_OF_THREADS = should_be_single_thread ? 1 : (N_OF_CORES / 2 * 2); - SRC_STATE *per_thread_state = (SRC_STATE *)malloc(num_of_threads * sizeof(SRC_STATE)); - SRC_DATA *per_thread_data = (SRC_DATA *)malloc(num_of_threads * sizeof(SRC_DATA)); - SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(num_of_threads * sizeof(SINC_FILTER)); - SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(num_of_threads * sizeof(SRC_ERROR)); + int _interleave = 1; + while ( ((NUM_OF_THREADS / _interleave / 2) % 2 == 0 || (NUM_OF_THREADS / _interleave / 2) == 1) + && _interleave * 2 <= NUM_OF_THREADS) + { + _interleave *= 2; + } + + const int interleave = _interleave; + + assert( NUM_OF_THREADS % interleave == 0 ); + assert( (NUM_OF_THREADS / interleave) % 2 == 0 ); + + const int interleave_mask = interleave - 1; + + SRC_STATE *per_thread_state = (SRC_STATE *)malloc(NUM_OF_THREADS * sizeof(SRC_STATE)); + SRC_DATA *per_thread_data = (SRC_DATA *)malloc(NUM_OF_THREADS * sizeof(SRC_DATA)); + SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(NUM_OF_THREADS * sizeof(SINC_FILTER)); + SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(NUM_OF_THREADS * sizeof(SRC_ERROR)); - float **per_thread_buffer = (float **)calloc(num_of_threads, sizeof(float *)); - double **per_thread_data_out = (double **)calloc(num_of_threads, sizeof(double *)); + double **per_thread_data_out = (double **)calloc(NUM_OF_THREADS, sizeof(double *)); SRC_ERROR retval = SRC_ERR_MALLOC_FAILED; if ( !per_thread_state || !per_thread_data || !per_thread_filter - || !per_thread_buffer || !per_thread_data_out || !per_thread_retval ) + || !per_thread_data_out || !per_thread_retval ) { goto cleanup_and_return; } - if (num_of_threads == 1) // w/o OpenMP + if (NUM_OF_THREADS == 1) // w/o OpenMP { per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double)); if (!per_thread_data_out[0]) goto cleanup_and_return; - per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, per_thread_data_out[0], state, data); + per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, 1, per_thread_data_out[0], state, data, state); for (int count = 0; count < filter->out_gen; count++) { - data->data_out[count] = per_thread_data_out[0][count]; + data->data_out[count] = (float)per_thread_data_out[0][count]; } retval = per_thread_retval[0]; @@ -563,15 +610,17 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) } // OpenMP - omp_set_num_threads(num_of_threads); + omp_set_num_threads(NUM_OF_THREADS); + + int omp_child_no; #pragma omp parallel for - for (int child_no = 0; child_no < num_of_threads; child_no++) + for (omp_child_no = 0; omp_child_no < NUM_OF_THREADS; omp_child_no++) { - per_thread_buffer[child_no] = (float *)malloc(filter_buffer_len * sizeof(float)); - per_thread_data_out[child_no] = (double *)malloc(out_count * sizeof(double)); + const int child_no = omp_child_no; + per_thread_data_out[child_no] = (double *)malloc((out_count/interleave + channels) * sizeof(double)); - if (!per_thread_buffer[child_no] || !per_thread_data_out[child_no]) + if ( !per_thread_data_out[child_no]) { per_thread_retval[child_no] = SRC_ERR_MALLOC_FAILED; @@ -585,16 +634,15 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) memcpy(&per_thread_state[child_no], state, sizeof(SRC_STATE)); per_thread_state[child_no].private_data = &per_thread_filter[child_no]; - memcpy(per_thread_buffer[child_no], filter->buffer, filter_buffer_len * sizeof(float)); - per_thread_filter[child_no].buffer = per_thread_buffer[child_no]; + per_thread_filter[child_no].buffer = filter->buffer; per_thread_retval[child_no] = _sinc_multichan_vari_process_mt( - num_of_threads, child_no, per_thread_data_out[child_no], - &per_thread_state[child_no], &per_thread_data[child_no]); + NUM_OF_THREADS, child_no, interleave, per_thread_data_out[child_no], + &per_thread_state[child_no], &per_thread_data[child_no], state); } // error checking for each worker - for (int child_no = 0; child_no < num_of_threads; child_no++) + for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++) { if (per_thread_retval[child_no] != SRC_ERR_NO_ERROR) { @@ -604,24 +652,27 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) } // update filter status - memcpy(filter->buffer, per_thread_buffer[0], filter_buffer_len * sizeof(float)); - float *buf = filter->buffer; memcpy(filter, &per_thread_filter[0], sizeof(SINC_FILTER)); filter->buffer = buf; memcpy(state, &per_thread_state[0], sizeof(SRC_STATE)); state->private_data = filter; - + memcpy(data, &per_thread_data[0], sizeof(SRC_DATA)); + int omp_count; + #pragma omp parallel for - for (int count = 0; count < filter->out_gen; count++) // sum up every worker's result + for (omp_count = 0; omp_count < filter->out_gen; omp_count++) // sum up every worker's result { + const int count = omp_count; + const int interleave_counter = count/channels; double sum = 0.0; - for (int child_no = 0; child_no < num_of_threads; child_no++) + for (int c_no = 0; c_no < NUM_OF_THREADS/interleave; c_no++) { - sum += per_thread_data_out[child_no][count]; + const int child_no = (c_no * interleave) + (interleave_counter & interleave_mask); + sum += per_thread_data_out[child_no][interleave_counter/interleave * channels]; } data->data_out[count] = (float)sum; } @@ -642,20 +693,9 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) if (per_thread_retval) free(per_thread_retval); - if (per_thread_buffer) - { - for (int child_no = 0; child_no < num_of_threads; child_no++) - { - if (per_thread_buffer[child_no]) - free(per_thread_buffer[child_no]); - } - - free(per_thread_buffer); - } - if (per_thread_data_out) { - for (int child_no = 0; child_no < num_of_threads; child_no++) + for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++) { if (per_thread_data_out[child_no]) free(per_thread_data_out[child_no]); diff --git a/tests/multichan_throughput_test.c b/tests/multichan_throughput_test.c index 9cf5639b..7c88852b 100644 --- a/tests/multichan_throughput_test.c +++ b/tests/multichan_throughput_test.c @@ -41,7 +41,7 @@ static float output [BUFFER_LEN] ; static void throughput_test (int converter, int channels, long *best_throughput) { SRC_DATA src_data ; -#ifdef MULTI_THREADING +#if !defined(_WIN32) && defined(MULTI_THREADING) struct timespec start_gettime, finish_gettime; #else clock_t start_time, clock_time ; @@ -67,7 +67,7 @@ throughput_test (int converter, int channels, long *best_throughput) sleep (2) ; #endif -#ifdef MULTI_THREADING +#if !defined(_WIN32) && defined(MULTI_THREADING) clock_gettime(CLOCK_MONOTONIC, &start_gettime); #else start_time = clock () ; @@ -82,7 +82,7 @@ throughput_test (int converter, int channels, long *best_throughput) total_frames += src_data.output_frames_gen ; -#ifdef MULTI_THREADING +#if !defined(_WIN32) && defined(MULTI_THREADING) clock_gettime(CLOCK_MONOTONIC, &finish_gettime); duration = (finish_gettime.tv_sec - start_gettime.tv_sec); diff --git a/tests/throughput_test.c b/tests/throughput_test.c index 0a6aa834..f4181897 100644 --- a/tests/throughput_test.c +++ b/tests/throughput_test.c @@ -38,7 +38,7 @@ static float output [BUFFER_LEN] ; static long throughput_test (int converter, long best_throughput) { SRC_DATA src_data ; -#ifdef MULTI_THREADING +#if !defined(_WIN32) && defined(MULTI_THREADING) struct timespec start_gettime, finish_gettime; #else clock_t start_time, clock_time ; @@ -64,7 +64,7 @@ throughput_test (int converter, long best_throughput) sleep (2) ; #endif -#ifdef MULTI_THREADING +#if !defined(_WIN32) && defined(MULTI_THREADING) clock_gettime(CLOCK_MONOTONIC, &start_gettime); #else start_time = clock () ; @@ -79,7 +79,7 @@ throughput_test (int converter, long best_throughput) total_frames += src_data.output_frames_gen ; -#ifdef MULTI_THREADING +#if !defined(_WIN32) && defined(MULTI_THREADING) clock_gettime(CLOCK_MONOTONIC, &finish_gettime); duration = (finish_gettime.tv_sec - start_gettime.tv_sec); From 7c3ec94f0d64b45aaa1014781ab27208aea9b7a8 Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Tue, 14 Nov 2023 20:21:15 +0900 Subject: [PATCH 07/18] ensure the number of threads actually used by omp to have the barrier logic work correctly. --- src/src_sinc.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index ce25b3e9..5825191d 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -221,10 +221,10 @@ sinc_get_description (int src_enum) #ifdef _WIN32 #define ALWAYS_INLINE __forceinline - #include + //#include #else #define ALWAYS_INLINE __attribute__((always_inline)) static - #include + //#include #endif /* smaller frames are processed in single thread to avoid overheads */ @@ -551,13 +551,7 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) SINC_FILTER *filter = (SINC_FILTER *)state->private_data; const int filter_buffer_len = (filter->b_len + channels); -#ifdef _WIN32 - SYSTEM_INFO sysinfo; - GetSystemInfo(&sysinfo); - const int N_OF_CORES = sysinfo.dwNumberOfProcessors; -#else - const int N_OF_CORES = sysconf(_SC_NPROCESSORS_ONLN); -#endif + const int N_OF_CORES = omp_get_num_procs(); const int should_be_single_thread = (N_OF_CORES < 2 || in_count < MULTI_THREADING_THRESHOLD); const int NUM_OF_THREADS = should_be_single_thread ? 1 : (N_OF_CORES / 2 * 2); @@ -591,6 +585,12 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) goto cleanup_and_return; } + // OpenMP + omp_set_dynamic(0); + omp_set_num_threads(NUM_OF_THREADS); + + assert( NUM_OF_THREADS == omp_get_num_threads() ); + if (NUM_OF_THREADS == 1) // w/o OpenMP { per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double)); @@ -609,9 +609,6 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) goto cleanup_and_return; } - // OpenMP - omp_set_num_threads(NUM_OF_THREADS); - int omp_child_no; #pragma omp parallel for From 58c7d5f4aae812d26e9aeb7de8eb89515838b016 Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Tue, 14 Nov 2023 21:02:19 +0900 Subject: [PATCH 08/18] change threading strategy because overhead of allocating extra work buffer for each thread seems to be larger in fewer channel process cases. --- src/src_sinc.c | 171 +++++++++++-------------------------------------- 1 file changed, 38 insertions(+), 133 deletions(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index 5825191d..cfbbeb5a 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -231,21 +231,15 @@ sinc_get_description (int src_enum) #define MULTI_THREADING_THRESHOLD (256) ALWAYS_INLINE void -calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, const int th_no, - const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) +calc_output_multi_mt_core(const int skip_fraction, const SINC_FILTER * const filter, + const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float * const output) { - assert(num_of_threads == 1 || num_of_threads % 2 == 0); double left[MAX_CHANNELS] = {0}; double right[MAX_CHANNELS] = {0}; /* Convert input parameters into fixed point. */ const increment_t max_filter_index = int_to_fp(filter->coeff_half_len); - const int mt_increment_factor = (num_of_threads > 1) ? num_of_threads / 2 : 1; - const int mt_left_right_sw = th_no % 2; - const int mt_shift = th_no / 2; - - if (!mt_left_right_sw || num_of_threads == 1) { /* First apply the left half of the filter. */ increment_t filter_index1 = start_filter_index; @@ -262,9 +256,6 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con data_index1 += steps * channels; } - filter_index1 -= increment * mt_shift; - data_index1 = data_index1 + channels * mt_shift; - // left = 0.0; while (filter_index1 >= MAKE_INCREMENT_T(0)) { @@ -277,12 +268,11 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con for (int ch = 0; ch < channels; ch++) left[ch] += icoeff * filter->buffer[data_index1 + ch]; - filter_index1 -= increment * mt_increment_factor; - data_index1 = data_index1 + channels * mt_increment_factor; + filter_index1 -= increment; + data_index1 = data_index1 + channels; }; } - if (mt_left_right_sw || num_of_threads == 1) { /* Now apply the right half of the filter. */ increment_t filter_index2 = increment - start_filter_index; @@ -291,7 +281,6 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con int data_index2 = filter->b_current + channels * (1 + coeff_count2); // right = 0.0; - if (!mt_shift) { const double fraction = fp_to_double(filter_index2); const int indx = fp_to_int(filter_index2); @@ -306,9 +295,6 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con filter_index2 -= increment; data_index2 = data_index2 - channels; - filter_index2 -= increment * mt_shift; - data_index2 = data_index2 - channels * mt_shift; - while (filter_index2 > MAKE_INCREMENT_T(0)) { const double fraction = fp_to_double(filter_index2); @@ -320,38 +306,38 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con for (int ch = 0; ch < channels; ch++) right[ch] += icoeff * filter->buffer[data_index2 + ch]; - filter_index2 -= increment * mt_increment_factor; - data_index2 = data_index2 - channels * mt_increment_factor; + filter_index2 -= increment; + data_index2 = data_index2 - channels; } } for (int ch = 0; ch < channels; ch++) - output[ch] = (scale * (left[ch] + right[ch])); // double + output[ch] = (float)(scale * (left[ch] + right[ch])); } /* calc_output_multi_mt_core */ ALWAYS_INLINE void -calc_output_multi_mt_3(const int num_of_threads, const int th_no, - const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) +calc_output_multi_mt_2( + const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output) { const int skip_fraction = increment == ((increment >> SHIFT_BITS) << SHIFT_BITS) && start_filter_index == ((start_filter_index >> SHIFT_BITS) << SHIFT_BITS) ? 1 : 0; if (skip_fraction) { - calc_output_multi_mt_core(1, num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_core(1, filter, increment, start_filter_index, channels, scale, output); } else { - calc_output_multi_mt_core(0, num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_core(0, filter, increment, start_filter_index, channels, scale, output); } } ALWAYS_INLINE void -calc_output_multi_mt_2(const int num_of_threads, const int th_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) +calc_output_multi_mt(const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output) { #define OPTIMIZE_LINE(x) \ case (x): \ - calc_output_multi_mt_3(num_of_threads, th_no, filter, increment, start_filter_index, x, scale, output); \ + calc_output_multi_mt_2(filter, increment, start_filter_index, x, scale, output); \ break; switch (channels) // to kick the compile-time optimizer, channel numbers up to 16 are extracted as constants here. @@ -373,37 +359,15 @@ calc_output_multi_mt_2(const int num_of_threads, const int th_no, const SINC_FIL OPTIMIZE_LINE(15); OPTIMIZE_LINE(16); default: - calc_output_multi_mt_3(num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output); - break; - } -#undef OPTIMIZE_LINE -} - -ALWAYS_INLINE void -calc_output_multi_mt(const int num_of_threads, const int th_no, - const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output) -{ -#define OPTIMIZE_LINE(x) \ - case (x): \ - calc_output_multi_mt_2(x, th_no, filter, increment, start_filter_index, channels, scale, output); \ - break; - - switch (num_of_threads) // to kick the compile-time optimizer, the number of threads is extracted as constant here. - { - OPTIMIZE_LINE(1); - OPTIMIZE_LINE(6); - OPTIMIZE_LINE(10); - OPTIMIZE_LINE(14); - default: - calc_output_multi_mt_2(num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_2(filter, increment, start_filter_index, channels, scale, output); break; } #undef OPTIMIZE_LINE } static SRC_ERROR -_sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, const int interleave, double *const per_thread_data_out, - SRC_STATE *const state, SRC_DATA *const data, SRC_STATE *const main_state ) +_sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, + SRC_STATE * const state, SRC_DATA * const data, SRC_STATE *const main_state ) { if (state->private_data == NULL) return SRC_ERR_NO_PRIVATE; @@ -452,11 +416,12 @@ _sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, co /* Main processing loop. */ int interleave_counter = 0; - const int interleave_mask = interleave - 1; + float * const data_out = data->data_out; + while (filter->out_gen < out_count) { /* Need to reload buffer? */ - int samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len; + int samples_in_hand = ( filter->b_end < filter->b_current ) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current); if (samples_in_hand <= half_filter_chan_len) { @@ -480,7 +445,7 @@ _sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, co if (state->error != 0) return state->error; - samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len; + samples_in_hand = ( filter->b_end < filter->b_current ) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current); if (samples_in_hand <= half_filter_chan_len) break; }; @@ -512,17 +477,18 @@ _sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, co increment_t start_filter_index = double_to_fp(input_index * float_increment); - if ( (interleave_counter & interleave_mask) == (child_no & interleave_mask) ){ - calc_output_multi_mt(NUM_OF_THREADS/interleave, child_no/interleave, filter, increment, start_filter_index, channels, scale, per_thread_data_out + filter->out_gen/channels/interleave * channels); + if ( child_no == interleave_counter ){ + calc_output_multi_mt(filter, increment, start_filter_index, channels, scale, data_out + filter->out_gen); } - interleave_counter = (interleave_counter+1) & interleave_mask; + if ( ++interleave_counter == num_of_threads ) interleave_counter = 0; filter->out_gen += channels; /* Figure out the next index. */ input_index += (is_constant_ratio) ? constant_input_index_inc : 1.0 / src_ratio; rem = fmod_one(input_index); - filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)) % filter->b_len; + filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)); + if ( filter->b_current >= filter->b_len ) filter->b_current -= filter->b_len; input_index = rem; }; @@ -554,55 +520,30 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) const int N_OF_CORES = omp_get_num_procs(); const int should_be_single_thread = (N_OF_CORES < 2 || in_count < MULTI_THREADING_THRESHOLD); - const int NUM_OF_THREADS = should_be_single_thread ? 1 : (N_OF_CORES / 2 * 2); - - int _interleave = 1; - while ( ((NUM_OF_THREADS / _interleave / 2) % 2 == 0 || (NUM_OF_THREADS / _interleave / 2) == 1) - && _interleave * 2 <= NUM_OF_THREADS) - { - _interleave *= 2; - } - - const int interleave = _interleave; - - assert( NUM_OF_THREADS % interleave == 0 ); - assert( (NUM_OF_THREADS / interleave) % 2 == 0 ); - - const int interleave_mask = interleave - 1; + const int num_of_threads = should_be_single_thread ? 1 : (N_OF_CORES / 2 * 2); - SRC_STATE *per_thread_state = (SRC_STATE *)malloc(NUM_OF_THREADS * sizeof(SRC_STATE)); - SRC_DATA *per_thread_data = (SRC_DATA *)malloc(NUM_OF_THREADS * sizeof(SRC_DATA)); - SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(NUM_OF_THREADS * sizeof(SINC_FILTER)); - SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(NUM_OF_THREADS * sizeof(SRC_ERROR)); - - double **per_thread_data_out = (double **)calloc(NUM_OF_THREADS, sizeof(double *)); + SRC_STATE *per_thread_state = (SRC_STATE *)malloc(num_of_threads * sizeof(SRC_STATE)); + SRC_DATA *per_thread_data = (SRC_DATA *)malloc(num_of_threads * sizeof(SRC_DATA)); + SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(num_of_threads * sizeof(SINC_FILTER)); + SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(num_of_threads * sizeof(SRC_ERROR)); SRC_ERROR retval = SRC_ERR_MALLOC_FAILED; if ( !per_thread_state || !per_thread_data || !per_thread_filter - || !per_thread_data_out || !per_thread_retval ) + || !per_thread_retval ) { goto cleanup_and_return; } // OpenMP omp_set_dynamic(0); - omp_set_num_threads(NUM_OF_THREADS); + omp_set_num_threads(num_of_threads); - assert( NUM_OF_THREADS == omp_get_num_threads() ); + assert( num_of_threads == omp_get_num_threads() ); - if (NUM_OF_THREADS == 1) // w/o OpenMP + if (num_of_threads == 1) // w/o OpenMP { - per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double)); - if (!per_thread_data_out[0]) - goto cleanup_and_return; - - per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, 1, per_thread_data_out[0], state, data, state); - - for (int count = 0; count < filter->out_gen; count++) - { - data->data_out[count] = (float)per_thread_data_out[0][count]; - } + per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data, state); retval = per_thread_retval[0]; @@ -612,18 +553,9 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) int omp_child_no; #pragma omp parallel for - for (omp_child_no = 0; omp_child_no < NUM_OF_THREADS; omp_child_no++) + for (omp_child_no = 0; omp_child_no < num_of_threads; omp_child_no++) { const int child_no = omp_child_no; - per_thread_data_out[child_no] = (double *)malloc((out_count/interleave + channels) * sizeof(double)); - - if ( !per_thread_data_out[child_no]) - { - - per_thread_retval[child_no] = SRC_ERR_MALLOC_FAILED; - - continue; - } memcpy(&per_thread_data[child_no], data, sizeof(SRC_DATA)); memcpy(&per_thread_filter[child_no], filter, sizeof(SINC_FILTER)); @@ -634,12 +566,12 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) per_thread_filter[child_no].buffer = filter->buffer; per_thread_retval[child_no] = _sinc_multichan_vari_process_mt( - NUM_OF_THREADS, child_no, interleave, per_thread_data_out[child_no], + num_of_threads, child_no, &per_thread_state[child_no], &per_thread_data[child_no], state); } // error checking for each worker - for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++) + for (int child_no = 0; child_no < num_of_threads; child_no++) { if (per_thread_retval[child_no] != SRC_ERR_NO_ERROR) { @@ -658,22 +590,6 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) memcpy(data, &per_thread_data[0], sizeof(SRC_DATA)); - int omp_count; - -#pragma omp parallel for - for (omp_count = 0; omp_count < filter->out_gen; omp_count++) // sum up every worker's result - { - const int count = omp_count; - const int interleave_counter = count/channels; - double sum = 0.0; - for (int c_no = 0; c_no < NUM_OF_THREADS/interleave; c_no++) - { - const int child_no = (c_no * interleave) + (interleave_counter & interleave_mask); - sum += per_thread_data_out[child_no][interleave_counter/interleave * channels]; - } - data->data_out[count] = (float)sum; - } - retval = SRC_ERR_NO_ERROR; cleanup_and_return: @@ -690,17 +606,6 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) if (per_thread_retval) free(per_thread_retval); - if (per_thread_data_out) - { - for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++) - { - if (per_thread_data_out[child_no]) - free(per_thread_data_out[child_no]); - } - - free(per_thread_data_out); - } - return retval; } From 9bce93e7e9373e4643467f5109b05a269ea429f3 Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Thu, 16 Nov 2023 18:20:14 +0900 Subject: [PATCH 09/18] allow odd numbers of threads prefetch coefft to cache memory --- src/src_sinc.c | 66 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index cfbbeb5a..5fc8b8bf 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -221,17 +221,26 @@ sinc_get_description (int src_enum) #ifdef _WIN32 #define ALWAYS_INLINE __forceinline - //#include -#else + #include + + #define mem_prefetch(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) + +#elif defined(__GNUC__) || defined(clang) #define ALWAYS_INLINE __attribute__((always_inline)) static - //#include + + #define mem_prefetch(ptr) __builtin_prefetch(ptr) + +#else + #define ALWAYS_INLINE static + + #define mem_prefetch(ptr) #endif /* smaller frames are processed in single thread to avoid overheads */ #define MULTI_THREADING_THRESHOLD (256) ALWAYS_INLINE void -calc_output_multi_mt_core(const int skip_fraction, const SINC_FILTER * const filter, +calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, const SINC_FILTER * const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float * const output) { double left[MAX_CHANNELS] = {0}; @@ -240,6 +249,8 @@ calc_output_multi_mt_core(const int skip_fraction, const SINC_FILTER * const fil /* Convert input parameters into fixed point. */ const increment_t max_filter_index = int_to_fp(filter->coeff_half_len); + const int prefetch_increment = 8; + { /* First apply the left half of the filter. */ increment_t filter_index1 = start_filter_index; @@ -259,6 +270,12 @@ calc_output_multi_mt_core(const int skip_fraction, const SINC_FILTER * const fil // left = 0.0; while (filter_index1 >= MAKE_INCREMENT_T(0)) { + if ( enable_prefetch && filter_index1 - increment * prefetch_increment >= MAKE_INCREMENT_T(0) ){ + const int indx = fp_to_int(filter_index1 - increment * prefetch_increment); + mem_prefetch(&filter->coeffs[indx]); + mem_prefetch(&filter->coeffs[indx+1]); + } + const double fraction = fp_to_double(filter_index1); const int indx = fp_to_int(filter_index1); assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); @@ -281,22 +298,13 @@ calc_output_multi_mt_core(const int skip_fraction, const SINC_FILTER * const fil int data_index2 = filter->b_current + channels * (1 + coeff_count2); // right = 0.0; - { - const double fraction = fp_to_double(filter_index2); - const int indx = fp_to_int(filter_index2); - assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); - const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); - assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len); - assert(data_index2 + channels - 1 < filter->b_end); - for (int ch = 0; ch < channels; ch++) - right[ch] += icoeff * filter->buffer[data_index2 + ch]; - } - - filter_index2 -= increment; - data_index2 = data_index2 - channels; + do { + if ( enable_prefetch && filter_index2 - increment * prefetch_increment > MAKE_INCREMENT_T(0) ){ + const int indx = fp_to_int(filter_index2 - increment * prefetch_increment); + mem_prefetch(&filter->coeffs[indx]); + mem_prefetch(&filter->coeffs[indx+1]); + } - while (filter_index2 > MAKE_INCREMENT_T(0)) - { const double fraction = fp_to_double(filter_index2); const int indx = fp_to_int(filter_index2); assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); @@ -308,7 +316,8 @@ calc_output_multi_mt_core(const int skip_fraction, const SINC_FILTER * const fil filter_index2 -= increment; data_index2 = data_index2 - channels; - } + } while (filter_index2 > MAKE_INCREMENT_T(0)); + } for (int ch = 0; ch < channels; ch++) @@ -321,14 +330,25 @@ calc_output_multi_mt_2( { const int skip_fraction = increment == ((increment >> SHIFT_BITS) << SHIFT_BITS) && start_filter_index == ((start_filter_index >> SHIFT_BITS) << SHIFT_BITS) ? 1 : 0; + const int enable_prefetch = (filter->coeff_half_len > ARRAY_LEN (slow_mid_qual_coeffs.coeffs)); if (skip_fraction) { - calc_output_multi_mt_core(1, filter, increment, start_filter_index, channels, scale, output); + if (enable_prefetch){ + calc_output_multi_mt_core(1, 1, filter, increment, start_filter_index, channels, scale, output); + } + else{ + calc_output_multi_mt_core(0, 1, filter, increment, start_filter_index, channels, scale, output); + } } else { - calc_output_multi_mt_core(0, filter, increment, start_filter_index, channels, scale, output); + if (enable_prefetch){ + calc_output_multi_mt_core(1, 0, filter, increment, start_filter_index, channels, scale, output); + } + else{ + calc_output_multi_mt_core(0, 0, filter, increment, start_filter_index, channels, scale, output); + } } } @@ -520,7 +540,7 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) const int N_OF_CORES = omp_get_num_procs(); const int should_be_single_thread = (N_OF_CORES < 2 || in_count < MULTI_THREADING_THRESHOLD); - const int num_of_threads = should_be_single_thread ? 1 : (N_OF_CORES / 2 * 2); + const int num_of_threads = should_be_single_thread ? 1 : N_OF_CORES; SRC_STATE *per_thread_state = (SRC_STATE *)malloc(num_of_threads * sizeof(SRC_STATE)); SRC_DATA *per_thread_data = (SRC_DATA *)malloc(num_of_threads * sizeof(SRC_DATA)); From 6d3132ad2151a98edbf14e59acead2234a779fb3 Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Thu, 16 Nov 2023 18:33:30 +0900 Subject: [PATCH 10/18] format style --- src/src_sinc.c | 370 +++++++++++++++++++++++++------------------------ 1 file changed, 189 insertions(+), 181 deletions(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index 5fc8b8bf..3fb77427 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -229,7 +229,7 @@ sinc_get_description (int src_enum) #define ALWAYS_INLINE __attribute__((always_inline)) static #define mem_prefetch(ptr) __builtin_prefetch(ptr) - + #else #define ALWAYS_INLINE static @@ -240,8 +240,8 @@ sinc_get_description (int src_enum) #define MULTI_THREADING_THRESHOLD (256) ALWAYS_INLINE void -calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, const SINC_FILTER * const filter, - const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float * const output) +calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, const SINC_FILTER *const filter, + const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output) { double left[MAX_CHANNELS] = {0}; double right[MAX_CHANNELS] = {0}; @@ -252,76 +252,78 @@ calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, co const int prefetch_increment = 8; { - /* First apply the left half of the filter. */ - increment_t filter_index1 = start_filter_index; - const int coeff_count1 = (max_filter_index - filter_index1) / increment; - filter_index1 = filter_index1 + coeff_count1 * increment; - int data_index1 = filter->b_current - channels * coeff_count1; + /* First apply the left half of the filter. */ + increment_t filter_index1 = start_filter_index; + const int coeff_count1 = (max_filter_index - filter_index1) / increment; + filter_index1 = filter_index1 + coeff_count1 * increment; + int data_index1 = filter->b_current - channels * coeff_count1; - if (data_index1 < 0) /* Avoid underflow access to filter->buffer. */ - { - int steps = int_div_ceil(-data_index1, channels); - /* If the assert triggers we would have to take care not to underflow/overflow */ - assert(steps <= int_div_ceil(filter_index1, increment)); - filter_index1 -= increment * steps; - data_index1 += steps * channels; - } + if (data_index1 < 0) /* Avoid underflow access to filter->buffer. */ + { + int steps = int_div_ceil(-data_index1, channels); + /* If the assert triggers we would have to take care not to underflow/overflow */ + assert(steps <= int_div_ceil(filter_index1, increment)); + filter_index1 -= increment * steps; + data_index1 += steps * channels; + } - // left = 0.0; - while (filter_index1 >= MAKE_INCREMENT_T(0)) + // left = 0.0; + while (filter_index1 >= MAKE_INCREMENT_T(0)) + { + if (enable_prefetch && filter_index1 - increment * prefetch_increment >= MAKE_INCREMENT_T(0)) { - if ( enable_prefetch && filter_index1 - increment * prefetch_increment >= MAKE_INCREMENT_T(0) ){ - const int indx = fp_to_int(filter_index1 - increment * prefetch_increment); - mem_prefetch(&filter->coeffs[indx]); - mem_prefetch(&filter->coeffs[indx+1]); - } + const int indx = fp_to_int(filter_index1 - increment * prefetch_increment); + mem_prefetch(&filter->coeffs[indx]); + mem_prefetch(&filter->coeffs[indx + 1]); + } - const double fraction = fp_to_double(filter_index1); - const int indx = fp_to_int(filter_index1); - assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); - const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); - assert(data_index1 >= 0 && data_index1 + channels - 1 < filter->b_len); - assert(data_index1 + channels - 1 < filter->b_end); - for (int ch = 0; ch < channels; ch++) - left[ch] += icoeff * filter->buffer[data_index1 + ch]; - - filter_index1 -= increment; - data_index1 = data_index1 + channels; - }; + const double fraction = fp_to_double(filter_index1); + const int indx = fp_to_int(filter_index1); + assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); + const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); + assert(data_index1 >= 0 && data_index1 + channels - 1 < filter->b_len); + assert(data_index1 + channels - 1 < filter->b_end); + for (int ch = 0; ch < channels; ch++) + left[ch] += icoeff * filter->buffer[data_index1 + ch]; + + filter_index1 -= increment; + data_index1 = data_index1 + channels; + }; } { - /* Now apply the right half of the filter. */ - increment_t filter_index2 = increment - start_filter_index; - const int coeff_count2 = (max_filter_index - filter_index2) / increment; - filter_index2 = filter_index2 + coeff_count2 * increment; - int data_index2 = filter->b_current + channels * (1 + coeff_count2); - // right = 0.0; - - do { - if ( enable_prefetch && filter_index2 - increment * prefetch_increment > MAKE_INCREMENT_T(0) ){ - const int indx = fp_to_int(filter_index2 - increment * prefetch_increment); - mem_prefetch(&filter->coeffs[indx]); - mem_prefetch(&filter->coeffs[indx+1]); - } - - const double fraction = fp_to_double(filter_index2); - const int indx = fp_to_int(filter_index2); - assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); - const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); - assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len); - assert(data_index2 + channels - 1 < filter->b_end); - for (int ch = 0; ch < channels; ch++) - right[ch] += icoeff * filter->buffer[data_index2 + ch]; - - filter_index2 -= increment; - data_index2 = data_index2 - channels; - } while (filter_index2 > MAKE_INCREMENT_T(0)); + /* Now apply the right half of the filter. */ + increment_t filter_index2 = increment - start_filter_index; + const int coeff_count2 = (max_filter_index - filter_index2) / increment; + filter_index2 = filter_index2 + coeff_count2 * increment; + int data_index2 = filter->b_current + channels * (1 + coeff_count2); + // right = 0.0; + + do + { + if (enable_prefetch && filter_index2 - increment * prefetch_increment > MAKE_INCREMENT_T(0)) + { + const int indx = fp_to_int(filter_index2 - increment * prefetch_increment); + mem_prefetch(&filter->coeffs[indx]); + mem_prefetch(&filter->coeffs[indx + 1]); + } + const double fraction = fp_to_double(filter_index2); + const int indx = fp_to_int(filter_index2); + assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); + const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); + assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len); + assert(data_index2 + channels - 1 < filter->b_end); + for (int ch = 0; ch < channels; ch++) + right[ch] += icoeff * filter->buffer[data_index2 + ch]; + + filter_index2 -= increment; + data_index2 = data_index2 - channels; + } while (filter_index2 > MAKE_INCREMENT_T(0)); } for (int ch = 0; ch < channels; ch++) - output[ch] = (float)(scale * (left[ch] + right[ch])); + output[ch] = (float)(scale * (left[ch] + right[ch])); } /* calc_output_multi_mt_core */ ALWAYS_INLINE void @@ -330,23 +332,27 @@ calc_output_multi_mt_2( { const int skip_fraction = increment == ((increment >> SHIFT_BITS) << SHIFT_BITS) && start_filter_index == ((start_filter_index >> SHIFT_BITS) << SHIFT_BITS) ? 1 : 0; - const int enable_prefetch = (filter->coeff_half_len > ARRAY_LEN (slow_mid_qual_coeffs.coeffs)); + const int enable_prefetch = (filter->coeff_half_len > ARRAY_LEN(slow_mid_qual_coeffs.coeffs)); if (skip_fraction) { - if (enable_prefetch){ + if (enable_prefetch) + { calc_output_multi_mt_core(1, 1, filter, increment, start_filter_index, channels, scale, output); } - else{ + else + { calc_output_multi_mt_core(0, 1, filter, increment, start_filter_index, channels, scale, output); } } else { - if (enable_prefetch){ + if (enable_prefetch) + { calc_output_multi_mt_core(1, 0, filter, increment, start_filter_index, channels, scale, output); } - else{ + else + { calc_output_multi_mt_core(0, 0, filter, increment, start_filter_index, channels, scale, output); } } @@ -355,49 +361,49 @@ calc_output_multi_mt_2( ALWAYS_INLINE void calc_output_multi_mt(const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output) { -#define OPTIMIZE_LINE(x) \ - case (x): \ - calc_output_multi_mt_2(filter, increment, start_filter_index, x, scale, output); \ - break; +#define OPTIMIZE_LINE(x) \ + case (x): \ + calc_output_multi_mt_2(filter, increment, start_filter_index, x, scale, output); \ + break; switch (channels) // to kick the compile-time optimizer, channel numbers up to 16 are extracted as constants here. { - OPTIMIZE_LINE(1); - OPTIMIZE_LINE(2); - OPTIMIZE_LINE(3); - OPTIMIZE_LINE(4); - OPTIMIZE_LINE(5); - OPTIMIZE_LINE(6); - OPTIMIZE_LINE(7); - OPTIMIZE_LINE(8); - OPTIMIZE_LINE(9); - OPTIMIZE_LINE(10); - OPTIMIZE_LINE(11); - OPTIMIZE_LINE(12); - OPTIMIZE_LINE(13); - OPTIMIZE_LINE(14); - OPTIMIZE_LINE(15); - OPTIMIZE_LINE(16); + OPTIMIZE_LINE(1); + OPTIMIZE_LINE(2); + OPTIMIZE_LINE(3); + OPTIMIZE_LINE(4); + OPTIMIZE_LINE(5); + OPTIMIZE_LINE(6); + OPTIMIZE_LINE(7); + OPTIMIZE_LINE(8); + OPTIMIZE_LINE(9); + OPTIMIZE_LINE(10); + OPTIMIZE_LINE(11); + OPTIMIZE_LINE(12); + OPTIMIZE_LINE(13); + OPTIMIZE_LINE(14); + OPTIMIZE_LINE(15); + OPTIMIZE_LINE(16); default: - calc_output_multi_mt_2(filter, increment, start_filter_index, channels, scale, output); - break; + calc_output_multi_mt_2(filter, increment, start_filter_index, channels, scale, output); + break; } #undef OPTIMIZE_LINE } static SRC_ERROR _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, - SRC_STATE * const state, SRC_DATA * const data, SRC_STATE *const main_state ) + SRC_STATE *const state, SRC_DATA *const data, SRC_STATE *const main_state) { if (state->private_data == NULL) - return SRC_ERR_NO_PRIVATE; + return SRC_ERR_NO_PRIVATE; SINC_FILTER *filter = (SINC_FILTER *)state->private_data; SINC_FILTER *main_filter = (SINC_FILTER *)main_state->private_data; /* If there is not a problem, this will be optimised out. */ if (sizeof(filter->buffer[0]) != sizeof(data->data_in[0])) - return SRC_ERR_SIZE_INCOMPATIBILITY; + return SRC_ERR_SIZE_INCOMPATIBILITY; const int channels = state->channels; filter->in_count = data->input_frames * channels; @@ -407,12 +413,12 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, double src_ratio = state->last_ratio; if (is_bad_src_ratio(src_ratio)) - return SRC_ERR_BAD_INTERNAL_STATE; + return SRC_ERR_BAD_INTERNAL_STATE; /* Check the sample rate ratio wrt the buffer len. */ double count = (filter->coeff_half_len + 2.0) / filter->index_inc; if (MIN(state->last_ratio, data->src_ratio) < 1.0) - count /= MIN(state->last_ratio, data->src_ratio); + count /= MIN(state->last_ratio, data->src_ratio); /* Maximum coefficientson either side of center point. */ const int half_filter_chan_len = channels * (int)(psf_lrint(count) + 1); @@ -436,80 +442,83 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, /* Main processing loop. */ int interleave_counter = 0; - float * const data_out = data->data_out; - + float *const data_out = data->data_out; + while (filter->out_gen < out_count) { - /* Need to reload buffer? */ - int samples_in_hand = ( filter->b_end < filter->b_current ) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current); + /* Need to reload buffer? */ + int samples_in_hand = (filter->b_end < filter->b_current) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current); - if (samples_in_hand <= half_filter_chan_len) + if (samples_in_hand <= half_filter_chan_len) + { + // only one buffer is used (shared by all threads) { - // only one buffer is used (shared by all threads) + #pragma omp barrier + #pragma omp single + { + state->error = prepare_data(filter, channels, data, half_filter_chan_len); + + *main_state = *state; + *main_filter = *filter; + } + #pragma omp barrier { - #pragma omp barrier - #pragma omp single - { - state->error = prepare_data(filter, channels, data, half_filter_chan_len); - - *main_state = *state; - *main_filter = *filter; - } - #pragma omp barrier - { - *state = *main_state; - *filter = *main_filter; - } + *state = *main_state; + *filter = *main_filter; } + } - if (state->error != 0) - return state->error; + if (state->error != 0) + return state->error; - samples_in_hand = ( filter->b_end < filter->b_current ) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current); - if (samples_in_hand <= half_filter_chan_len) - break; - }; + samples_in_hand = (filter->b_end < filter->b_current) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current); + if (samples_in_hand <= half_filter_chan_len) + break; + }; - /* This is the termination condition. */ - if (filter->b_real_end >= 0) - { - if (filter->b_current + input_index + terminate > filter->b_real_end) - break; - }; + /* This is the termination condition. */ + if (filter->b_real_end >= 0) + { + if (filter->b_current + input_index + terminate > filter->b_real_end) + break; + }; - double scale, float_increment; - increment_t increment; - if (!is_constant_ratio) - { - if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10) - src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count; + double scale, float_increment; + increment_t increment; + if (!is_constant_ratio) + { + if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10) + src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count; - float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0); - increment = double_to_fp(float_increment); - scale = float_increment / index_inc; - } - else - { - float_increment = constant_float_increment; - increment = constant_increment; - scale = constant_scale; - } + float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0); + increment = double_to_fp(float_increment); + scale = float_increment / index_inc; + } + else + { + float_increment = constant_float_increment; + increment = constant_increment; + scale = constant_scale; + } - increment_t start_filter_index = double_to_fp(input_index * float_increment); + increment_t start_filter_index = double_to_fp(input_index * float_increment); - if ( child_no == interleave_counter ){ - calc_output_multi_mt(filter, increment, start_filter_index, channels, scale, data_out + filter->out_gen); - } - if ( ++interleave_counter == num_of_threads ) interleave_counter = 0; - filter->out_gen += channels; + if (child_no == interleave_counter) + { + calc_output_multi_mt(filter, increment, start_filter_index, channels, scale, data_out + filter->out_gen); + } + if (++interleave_counter == num_of_threads) + interleave_counter = 0; + filter->out_gen += channels; - /* Figure out the next index. */ - input_index += (is_constant_ratio) ? constant_input_index_inc : 1.0 / src_ratio; - rem = fmod_one(input_index); + /* Figure out the next index. */ + input_index += (is_constant_ratio) ? constant_input_index_inc : 1.0 / src_ratio; + rem = fmod_one(input_index); - filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)); - if ( filter->b_current >= filter->b_len ) filter->b_current -= filter->b_len; - input_index = rem; + filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)); + if (filter->b_current >= filter->b_len) + filter->b_current -= filter->b_len; + input_index = rem; }; state->last_position = input_index; @@ -527,7 +536,7 @@ static SRC_ERROR sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) { if (state->private_data == NULL) - return SRC_ERR_NO_PRIVATE; + return SRC_ERR_NO_PRIVATE; const int channels = state->channels; @@ -549,55 +558,54 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) SRC_ERROR retval = SRC_ERR_MALLOC_FAILED; - if ( !per_thread_state || !per_thread_data || !per_thread_filter - || !per_thread_retval ) + if (!per_thread_state || !per_thread_data || !per_thread_filter || !per_thread_retval) { - goto cleanup_and_return; + goto cleanup_and_return; } // OpenMP omp_set_dynamic(0); omp_set_num_threads(num_of_threads); - assert( num_of_threads == omp_get_num_threads() ); + assert(num_of_threads == omp_get_num_threads()); if (num_of_threads == 1) // w/o OpenMP { - per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data, state); + per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data, state); - retval = per_thread_retval[0]; + retval = per_thread_retval[0]; - goto cleanup_and_return; + goto cleanup_and_return; } int omp_child_no; -#pragma omp parallel for + #pragma omp parallel for for (omp_child_no = 0; omp_child_no < num_of_threads; omp_child_no++) { - const int child_no = omp_child_no; + const int child_no = omp_child_no; - memcpy(&per_thread_data[child_no], data, sizeof(SRC_DATA)); - memcpy(&per_thread_filter[child_no], filter, sizeof(SINC_FILTER)); + memcpy(&per_thread_data[child_no], data, sizeof(SRC_DATA)); + memcpy(&per_thread_filter[child_no], filter, sizeof(SINC_FILTER)); - memcpy(&per_thread_state[child_no], state, sizeof(SRC_STATE)); - per_thread_state[child_no].private_data = &per_thread_filter[child_no]; + memcpy(&per_thread_state[child_no], state, sizeof(SRC_STATE)); + per_thread_state[child_no].private_data = &per_thread_filter[child_no]; - per_thread_filter[child_no].buffer = filter->buffer; + per_thread_filter[child_no].buffer = filter->buffer; - per_thread_retval[child_no] = _sinc_multichan_vari_process_mt( - num_of_threads, child_no, - &per_thread_state[child_no], &per_thread_data[child_no], state); + per_thread_retval[child_no] = _sinc_multichan_vari_process_mt( + num_of_threads, child_no, + &per_thread_state[child_no], &per_thread_data[child_no], state); } // error checking for each worker for (int child_no = 0; child_no < num_of_threads; child_no++) { - if (per_thread_retval[child_no] != SRC_ERR_NO_ERROR) - { - retval = per_thread_retval[child_no]; - goto cleanup_and_return; - } + if (per_thread_retval[child_no] != SRC_ERR_NO_ERROR) + { + retval = per_thread_retval[child_no]; + goto cleanup_and_return; + } } // update filter status @@ -607,7 +615,7 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) memcpy(state, &per_thread_state[0], sizeof(SRC_STATE)); state->private_data = filter; - + memcpy(data, &per_thread_data[0], sizeof(SRC_DATA)); retval = SRC_ERR_NO_ERROR; @@ -615,16 +623,16 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) cleanup_and_return: if (per_thread_state) - free(per_thread_state); + free(per_thread_state); if (per_thread_data) - free(per_thread_data); + free(per_thread_data); if (per_thread_filter) - free(per_thread_filter); + free(per_thread_filter); if (per_thread_retval) - free(per_thread_retval); + free(per_thread_retval); return retval; } From 99d439649e4696ba61e49224d30c29ccd17b6b78 Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Fri, 17 Nov 2023 02:43:46 +0900 Subject: [PATCH 11/18] fix for debug build --- src/src_sinc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index 3fb77427..9c93f024 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -567,9 +567,9 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) omp_set_dynamic(0); omp_set_num_threads(num_of_threads); - assert(num_of_threads == omp_get_num_threads()); + //assert(num_of_threads == omp_get_max_threads()); - if (num_of_threads == 1) // w/o OpenMP + if (num_of_threads == 1 || omp_get_max_threads() == 1 ) // w/o OpenMP { per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data, state); From 7348582e57dcf5c0ee65ff104f413893648489a8 Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Fri, 17 Nov 2023 22:13:33 +0900 Subject: [PATCH 12/18] change WIN32 to MSVC for openmp build option switch --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 21ce661c..02306312 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -44,7 +44,7 @@ add_library(samplerate add_library(SampleRate::samplerate ALIAS samplerate) if(MULTI_THREADING) - if(WIN32) + if(MSVC) target_link_libraries(samplerate PRIVATE libomp) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /openmp:llvm") else() From bd6aec33ee6659f32b76ca365de6a4123580478a Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Fri, 17 Nov 2023 22:25:56 +0900 Subject: [PATCH 13/18] cache once-calculated coeffs if src_ratio is integer --- src/src_sinc.c | 275 ++++++++++++++++++++++++++---- tests/multichan_throughput_test.c | 61 ++++--- 2 files changed, 273 insertions(+), 63 deletions(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index 9c93f024..225570d1 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -229,7 +229,7 @@ sinc_get_description (int src_enum) #define ALWAYS_INLINE __attribute__((always_inline)) static #define mem_prefetch(ptr) __builtin_prefetch(ptr) - + #else #define ALWAYS_INLINE static @@ -239,8 +239,32 @@ sinc_get_description (int src_enum) /* smaller frames are processed in single thread to avoid overheads */ #define MULTI_THREADING_THRESHOLD (256) +#define MT_COEFFS_CACHING 1 + +enum MT_CACHE_MODE +{ + MT_CACHE_NONE, + MT_CACHE_READ, + MT_CACHE_WRITE +}; + +typedef struct mt_cache_t +{ + enum MT_CACHE_MODE cache_state; + increment_t start_filter_index; + double *coeffs; +} mt_cache_t; + +typedef struct mt_cache_array_t +{ + int len; + int len2; + mt_cache_t *caches; +} mt_cache_array_t; + ALWAYS_INLINE void -calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, const SINC_FILTER *const filter, +calc_output_multi_mt_core(const enum MT_CACHE_MODE use_cache, mt_cache_t *const cache, const int cache_len, + const int enable_prefetch, const int skip_fraction, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output) { double left[MAX_CHANNELS] = {0}; @@ -251,6 +275,8 @@ calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, co const int prefetch_increment = 8; + int cache_idx = 0; + { /* First apply the left half of the filter. */ increment_t filter_index1 = start_filter_index; @@ -270,19 +296,42 @@ calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, co // left = 0.0; while (filter_index1 >= MAKE_INCREMENT_T(0)) { - if (enable_prefetch && filter_index1 - increment * prefetch_increment >= MAKE_INCREMENT_T(0)) + double coeff; + + if (use_cache == MT_CACHE_READ) + { + coeff = cache->coeffs[cache_idx++]; + + assert(cache_idx <= cache_len); + } + else { - const int indx = fp_to_int(filter_index1 - increment * prefetch_increment); - mem_prefetch(&filter->coeffs[indx]); - mem_prefetch(&filter->coeffs[indx + 1]); + if (enable_prefetch && filter_index1 - increment * prefetch_increment >= MAKE_INCREMENT_T(0)) + { + const int indx = fp_to_int(filter_index1 - increment * prefetch_increment); + mem_prefetch(&filter->coeffs[indx]); + mem_prefetch(&filter->coeffs[indx + 1]); + } + + const double fraction = fp_to_double(filter_index1); + const int indx = fp_to_int(filter_index1); + assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); + const coeff_t coeff_val = filter->coeffs[indx]; + const coeff_t coeff_fraction = filter->coeffs[indx + 1] - filter->coeffs[indx]; + coeff = skip_fraction ? coeff_val : coeff_val + fraction * coeff_fraction; + assert(data_index1 >= 0 && data_index1 + channels - 1 < filter->b_len); + assert(data_index1 + channels - 1 < filter->b_end); + + if (use_cache == MT_CACHE_WRITE) + { + coeff = cache->coeffs[cache_idx++] = coeff; + + assert(cache_idx <= cache_len); + } } - const double fraction = fp_to_double(filter_index1); - const int indx = fp_to_int(filter_index1); - assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); - const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); - assert(data_index1 >= 0 && data_index1 + channels - 1 < filter->b_len); - assert(data_index1 + channels - 1 < filter->b_end); + const double icoeff = coeff; + for (int ch = 0; ch < channels; ch++) left[ch] += icoeff * filter->buffer[data_index1 + ch]; @@ -301,34 +350,65 @@ calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, co do { - if (enable_prefetch && filter_index2 - increment * prefetch_increment > MAKE_INCREMENT_T(0)) + double coeff; + + if (use_cache == MT_CACHE_READ) + { + coeff = cache->coeffs[cache_idx++]; + + assert(cache_idx <= cache_len); + } + else { - const int indx = fp_to_int(filter_index2 - increment * prefetch_increment); - mem_prefetch(&filter->coeffs[indx]); - mem_prefetch(&filter->coeffs[indx + 1]); + if (enable_prefetch && filter_index2 - increment * prefetch_increment > MAKE_INCREMENT_T(0)) + { + const int indx = fp_to_int(filter_index2 - increment * prefetch_increment); + mem_prefetch(&filter->coeffs[indx]); + mem_prefetch(&filter->coeffs[indx + 1]); + } + + const double fraction = fp_to_double(filter_index2); + const int indx = fp_to_int(filter_index2); + assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); + const coeff_t coeff_val = filter->coeffs[indx]; + const coeff_t coeff_fraction = (float)filter->coeffs[indx + 1] - (float)filter->coeffs[indx]; + coeff = skip_fraction ? coeff_val : coeff_val + fraction * coeff_fraction; + assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len); + assert(data_index2 + channels - 1 < filter->b_end); + + if (use_cache == MT_CACHE_WRITE) + { + coeff = cache->coeffs[cache_idx++] = coeff; + + assert(cache_idx <= cache_len); + } } - const double fraction = fp_to_double(filter_index2); - const int indx = fp_to_int(filter_index2); - assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); - const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]); - assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len); - assert(data_index2 + channels - 1 < filter->b_end); + const double icoeff = coeff; + for (int ch = 0; ch < channels; ch++) right[ch] += icoeff * filter->buffer[data_index2 + ch]; + const double c_coeff = coeff; filter_index2 -= increment; data_index2 = data_index2 - channels; + } while (filter_index2 > MAKE_INCREMENT_T(0)); } + if (use_cache == MT_CACHE_WRITE) + { + cache->start_filter_index = start_filter_index; + cache->cache_state = MT_CACHE_READ; + } + for (int ch = 0; ch < channels; ch++) output[ch] = (float)(scale * (left[ch] + right[ch])); } /* calc_output_multi_mt_core */ ALWAYS_INLINE void -calc_output_multi_mt_2( - const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output) +calc_output_multi_mt_3(const int use_cache, mt_cache_t *const cache, const int cache_len, const SINC_FILTER *const filter, + const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output) { const int skip_fraction = increment == ((increment >> SHIFT_BITS) << SHIFT_BITS) && start_filter_index == ((start_filter_index >> SHIFT_BITS) << SHIFT_BITS) ? 1 : 0; @@ -338,32 +418,81 @@ calc_output_multi_mt_2( { if (enable_prefetch) { - calc_output_multi_mt_core(1, 1, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_core(use_cache, cache, cache_len, 1, 1, filter, increment, start_filter_index, channels, scale, output); } else { - calc_output_multi_mt_core(0, 1, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_core(use_cache, cache, cache_len, 0, 1, filter, increment, start_filter_index, channels, scale, output); } } else { if (enable_prefetch) { - calc_output_multi_mt_core(1, 0, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_core(use_cache, cache, cache_len, 1, 0, filter, increment, start_filter_index, channels, scale, output); } else { - calc_output_multi_mt_core(0, 0, filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_core(use_cache, cache, cache_len, 0, 0, filter, increment, start_filter_index, channels, scale, output); } } } ALWAYS_INLINE void -calc_output_multi_mt(const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output) +calc_output_multi_mt_2(mt_cache_array_t *cache_array, const SINC_FILTER *const filter, + const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output) { -#define OPTIMIZE_LINE(x) \ - case (x): \ - calc_output_multi_mt_2(filter, increment, start_filter_index, x, scale, output); \ + const int cache_len = cache_array->len2; + const int idx = cache_array->len ? (int)(start_filter_index / (increment / (cache_array->len - 1))) : 0; + + mt_cache_t *cache = (cache_array->len && idx < cache_array->len) ? &cache_array->caches[idx] : NULL; + + enum MT_CACHE_MODE use_cache = MT_CACHE_NONE; + + if (cache) + { + enum MT_CACHE_MODE cache_state = cache->cache_state; + + if (cache_state == MT_CACHE_READ) + { + if (start_filter_index == cache->start_filter_index) + { + use_cache = MT_CACHE_READ; + } + // else { + // assert(0); // not expected to come here, but not harmful, so commenting out + // exit(0); + // } + } + else if (cache_state == MT_CACHE_NONE) + { + cache->cache_state = MT_CACHE_WRITE; + use_cache = MT_CACHE_WRITE; + } + } + + if (use_cache == MT_CACHE_READ) + { + // skip to core, since skip_fraction/enable_prefetch will not affect + calc_output_multi_mt_core(MT_CACHE_READ, cache, cache_len, 0, 0, filter, increment, start_filter_index, channels, scale, output); + } + else if (use_cache == MT_CACHE_WRITE) + { + calc_output_multi_mt_3(MT_CACHE_WRITE, cache, cache_len, filter, increment, start_filter_index, channels, scale, output); + } + else + { + calc_output_multi_mt_3(MT_CACHE_NONE, cache, cache_len, filter, increment, start_filter_index, channels, scale, output); + } +} + +ALWAYS_INLINE void +calc_output_multi_mt(mt_cache_array_t *cache_array, + const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output) +{ +#define OPTIMIZE_LINE(x) \ + case (x): \ + calc_output_multi_mt_2(cache_array, filter, increment, start_filter_index, x, scale, output); \ break; switch (channels) // to kick the compile-time optimizer, channel numbers up to 16 are extracted as constants here. @@ -385,7 +514,7 @@ calc_output_multi_mt(const SINC_FILTER *const filter, const increment_t incremen OPTIMIZE_LINE(15); OPTIMIZE_LINE(16); default: - calc_output_multi_mt_2(filter, increment, start_filter_index, channels, scale, output); + calc_output_multi_mt_2(cache_array, filter, increment, start_filter_index, channels, scale, output); break; } #undef OPTIMIZE_LINE @@ -444,6 +573,56 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, int interleave_counter = 0; float *const data_out = data->data_out; + mt_cache_array_t _cache_array = {0}; + mt_cache_array_t *const cache_array = &_cache_array; + +#if MT_COEFFS_CACHING + // Caching once-calculated (interpolated) coeffs in memory is reasonable if the src_ratio is an integer + // because only limited number of coeffs are cyclically used in those cases. + // Drawback is that the processing speed can fluctuate if the condition (src_ratio) changes. + + if (is_constant_ratio && src_ratio == (int)src_ratio) + { + + do + { + int len = (int)src_ratio + 1; + int len2 = 0; + cache_array->caches = (mt_cache_t *)calloc(len, sizeof(mt_cache_t)); + if (!cache_array->caches) + { + break; + } + + for (int i = 0; i < len; i++) + { + len2 = ((filter->coeff_half_len + 2) / filter->index_inc + filter->index_inc); + cache_array->caches[i].coeffs = (double *)calloc(len2, sizeof(double)); + if (!cache_array->caches[i].coeffs) + { + for (int j = 0; j < i; j++) + { + free(cache_array->caches[j].coeffs); + cache_array->caches[j].coeffs = NULL; + } + free(cache_array->caches); + cache_array->caches = NULL; + break; + } + } + + if (cache_array->caches) + { + cache_array->len = len; + cache_array->len2 = len2; + } + + } while (0); + } +#endif + + int rtn = SRC_ERR_NO_ERROR; + while (filter->out_gen < out_count) { /* Need to reload buffer? */ @@ -469,7 +648,10 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, } if (state->error != 0) - return state->error; + { + rtn = state->error; + break; + } samples_in_hand = (filter->b_end < filter->b_current) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current); if (samples_in_hand <= half_filter_chan_len) @@ -505,7 +687,7 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, if (child_no == interleave_counter) { - calc_output_multi_mt(filter, increment, start_filter_index, channels, scale, data_out + filter->out_gen); + calc_output_multi_mt(cache_array, filter, increment, start_filter_index, channels, scale, data_out + filter->out_gen); } if (++interleave_counter == num_of_threads) interleave_counter = 0; @@ -521,6 +703,25 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, input_index = rem; }; +#if MT_COEFFS_CACHING + { + if (cache_array->len) + { + for (int i = 0; i < cache_array->len; i++) + { + free(cache_array->caches[i].coeffs); + cache_array->caches[i].coeffs = NULL; + } + free(cache_array->caches); + cache_array->caches = NULL; + cache_array->len = 0; + } + } +#endif + + if (rtn) + return rtn; + state->last_position = input_index; /* Save current ratio rather then target ratio. */ @@ -567,9 +768,9 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data) omp_set_dynamic(0); omp_set_num_threads(num_of_threads); - //assert(num_of_threads == omp_get_max_threads()); + // assert(num_of_threads == omp_get_max_threads()); - if (num_of_threads == 1 || omp_get_max_threads() == 1 ) // w/o OpenMP + if (num_of_threads == 1 || omp_get_max_threads() == 1) // w/o OpenMP { per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data, state); diff --git a/tests/multichan_throughput_test.c b/tests/multichan_throughput_test.c index 7c88852b..b0aa2ea5 100644 --- a/tests/multichan_throughput_test.c +++ b/tests/multichan_throughput_test.c @@ -36,10 +36,10 @@ static float input [BUFFER_LEN] ; #if (defined(ENABLE_SINC_FAST_CONVERTER) || defined(ENABLE_SINC_MEDIUM_CONVERTER) || \ defined(ENABLE_SINC_BEST_CONVERTER)) -static float output [BUFFER_LEN] ; +static float output [BUFFER_LEN*2] ; static void -throughput_test (int converter, int channels, long *best_throughput) +throughput_test (int converter, int channels, long *best_throughput, double src_ratio) { SRC_DATA src_data ; #if !defined(_WIN32) && defined(MULTI_THREADING) struct timespec start_gettime, finish_gettime; @@ -50,7 +50,7 @@ throughput_test (int converter, int channels, long *best_throughput) long total_frames = 0, throughput ; int error ; - printf (" %-30s %2d ", src_get_name (converter), channels) ; + printf (" %-30s %2d ", src_get_name (converter), channels) ; fflush (stdout) ; src_data.data_in = input ; @@ -59,7 +59,7 @@ throughput_test (int converter, int channels, long *best_throughput) src_data.data_out = output ; src_data.output_frames = ARRAY_LEN (output) / channels ; - src_data.src_ratio = 0.99 ; + src_data.src_ratio = src_ratio ; #ifdef _WIN32 Sleep (2000) ; @@ -94,23 +94,25 @@ throughput_test (int converter, int channels, long *best_throughput) } while (duration < 5.0) ; - if (src_data.input_frames_used != src_data.input_frames) - { printf ("\n\nLine %d : input frames used %ld should be %ld\n", __LINE__, src_data.input_frames_used, src_data.input_frames) ; - exit (1) ; - } ; - - if (fabs (src_data.src_ratio * src_data.input_frames_used - src_data.output_frames_gen) > 2) - { printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ; - printf (" input len : %d\n", ARRAY_LEN (input) / channels) ; - printf (" output len : %ld (should be %g +/- 2)\n\n", src_data.output_frames_gen, - floor (0.5 + src_data.src_ratio * src_data.input_frames_used)) ; - exit (1) ; - } ; + if ( src_ratio <= 1.0 ){ + if (src_data.input_frames_used != src_data.input_frames) + { printf ("\n\nLine %d : input frames used %ld should be %ld\n", __LINE__, src_data.input_frames_used, src_data.input_frames) ; + exit (1) ; + } ; + + if (fabs (src_data.src_ratio * src_data.input_frames_used - src_data.output_frames_gen) > 2) + { printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ; + printf (" input len : %d\n", ARRAY_LEN (input) / channels) ; + printf (" output len : %ld (should be %g +/- 2)\n\n", src_data.output_frames_gen, + floor (0.5 + src_data.src_ratio * src_data.input_frames_used)) ; + exit (1) ; + } ; + } throughput = lrint (floor (total_frames / duration)) ; if (!best_throughput) - { printf ("%5.2f %10ld\n", duration, throughput) ; + { printf ("%5.2f %10ld (x%7.2f)\n", duration, throughput, (throughput/src_ratio/44100)) ; } else { *best_throughput = MAX (throughput, *best_throughput) ; @@ -131,31 +133,38 @@ single_run (void) printf ("\n CPU name : %s\n", get_cpu_name ()) ; + double src_ratio[] = {0.99, 2.0, 7.0, 0.25}; + +for( int i=0 ; i Date: Fri, 17 Nov 2023 23:07:21 +0900 Subject: [PATCH 14/18] fix type and clarify zero --- src/src_sinc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index 225570d1..9db3ace9 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -243,7 +243,7 @@ sinc_get_description (int src_enum) enum MT_CACHE_MODE { - MT_CACHE_NONE, + MT_CACHE_NONE = 0, MT_CACHE_READ, MT_CACHE_WRITE }; @@ -407,7 +407,7 @@ calc_output_multi_mt_core(const enum MT_CACHE_MODE use_cache, mt_cache_t *const } /* calc_output_multi_mt_core */ ALWAYS_INLINE void -calc_output_multi_mt_3(const int use_cache, mt_cache_t *const cache, const int cache_len, const SINC_FILTER *const filter, +calc_output_multi_mt_3(const enum MT_CACHE_MODE use_cache, mt_cache_t *const cache, const int cache_len, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output) { From 1c01a777eee1501882fa343989010fc540d35501 Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Sat, 18 Nov 2023 00:01:49 +0900 Subject: [PATCH 15/18] unnecessary (wrongly merged) cast removal --- src/src_sinc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index 9db3ace9..ba72029d 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -371,7 +371,7 @@ calc_output_multi_mt_core(const enum MT_CACHE_MODE use_cache, mt_cache_t *const const int indx = fp_to_int(filter_index2); assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2); const coeff_t coeff_val = filter->coeffs[indx]; - const coeff_t coeff_fraction = (float)filter->coeffs[indx + 1] - (float)filter->coeffs[indx]; + const coeff_t coeff_fraction = filter->coeffs[indx + 1] - filter->coeffs[indx]; coeff = skip_fraction ? coeff_val : coeff_val + fraction * coeff_fraction; assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len); assert(data_index2 + channels - 1 < filter->b_end); From 6d4abeb1839934458105afd7ad93ac0b03ca588d Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Sun, 19 Nov 2023 02:25:48 +0900 Subject: [PATCH 16/18] change _WIN32 to _MSC_VER --- src/src_sinc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index ba72029d..9b236bd1 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -219,7 +219,7 @@ sinc_get_description (int src_enum) #include -#ifdef _WIN32 +#ifdef _MSC_VER #define ALWAYS_INLINE __forceinline #include From c0aa8796a876a662c7791dde6a8b7f3beece9cb0 Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Sat, 16 Dec 2023 19:01:20 +0900 Subject: [PATCH 17/18] loop termination condition fix --- src/src_sinc.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index 9b236bd1..f83576b9 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -661,8 +661,22 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, /* This is the termination condition. */ if (filter->b_real_end >= 0) { - if (filter->b_current + input_index + terminate > filter->b_real_end) + #if 1 // perhaps this is the intended termination condition. + if (filter->b_current + input_index + terminate + channels - 1 > filter->b_real_end) break; + + #else // this one matches to the current (0.22) single-thread implementation, but it is seemingly a bug. + if (channels == 1) + { + if (filter->b_current + input_index + terminate > filter->b_real_end) + break; + } + else + { + if (filter->b_current + input_index + terminate >= filter->b_real_end) + break; + } + #endif }; double scale, float_increment; From d509240e6e992594719f8bb8e772dd2dae9c9ae4 Mon Sep 17 00:00:00 2001 From: ss3git <3226388001@jcom.home.ne.jp> Date: Sat, 16 Dec 2023 22:21:25 +0900 Subject: [PATCH 18/18] undo loop termination condition fix, and make the decision equivalent to the single-thread version. note: the (single-thread) implementation may have some underlying bug here because the number of output frames is seemingly inconsistent depending on the combinations of src_ratio, input frames, and number of channels. The fix should be out of scope for this PR (MultiThreading). --- src/src_sinc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/src_sinc.c b/src/src_sinc.c index f83576b9..9fee6fb0 100644 --- a/src/src_sinc.c +++ b/src/src_sinc.c @@ -661,11 +661,9 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, /* This is the termination condition. */ if (filter->b_real_end >= 0) { - #if 1 // perhaps this is the intended termination condition. - if (filter->b_current + input_index + terminate + channels - 1 > filter->b_real_end) - break; - - #else // this one matches to the current (0.22) single-thread implementation, but it is seemingly a bug. + // This switching is necessary to match the outputs to the current (0.22) single-thread implementation. + // However, the (single-thread) implementation may have some underlying bug because the number of output frames is seemingly + // inconsistent depending on the combinations of src_ratio, input frames, and number of channels. if (channels == 1) { if (filter->b_current + input_index + terminate > filter->b_real_end) @@ -676,7 +674,6 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, if (filter->b_current + input_index + terminate >= filter->b_real_end) break; } - #endif }; double scale, float_increment;