From 516b1b473de34b15995b551a294fe66f9abd87fc Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Thu, 14 Sep 2023 22:16:32 +0900
Subject: [PATCH 01/18] libsamplerate multi-thread (OpenMP) implementation

compiled and tested on
--
FreeBSD clang version 14.0.5 (https://github.com/llvm/llvm-project.git llvmorg-14.0.5-0-gc12386ae247c)
Target: x86_64-unknown-freebsd13.2
Thread model: posix
--
---
 CMakeLists.txt                    |   5 +
 src/CMakeLists.txt                |   5 +
 src/src_sinc.c                    | 394 ++++++++++++++++++++++++++++++
 tests/multichan_throughput_test.c |  15 ++
 tests/throughput_test.c           |  15 ++
 5 files changed, 434 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f5431dab..13cb59ca 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,6 +72,11 @@ endif()
 if(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
   option(LIBSAMPLERATE_ENABLE_SANITIZERS "Enable ASAN and UBSAN" OFF)
 
+  #if (BUILD_MULTI_THREADING)
+    set(MULTI_THREADING 1)
+    add_definitions(-DMULTI_THREADING)
+  #endif()
+
   if(LIBSAMPLERATE_ENABLE_SANITIZERS)
     # Use ASAN and UBSAN, make it fail on any error, improve stack traces
     set(sanitizer_flags -fsanitize=address,undefined -fno-sanitize-recover=all -fno-omit-frame-pointer)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4a0e6c87..6500caf5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -72,6 +72,11 @@ if(LIBM_REQUIRED)
     target_link_libraries(samplerate PRIVATE m)
 endif()
 
+if(MULTI_THREADING)
+  target_link_libraries(samplerate PRIVATE omp)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS_RELEASE} -fopenmp")
+endif()
+
 # Set public header
 set_property(TARGET samplerate PROPERTY PUBLIC_HEADER ${PROJECT_SOURCE_DIR}/include/samplerate.h)
 
diff --git a/src/src_sinc.c b/src/src_sinc.c
index 716c4a40..6f757439 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -130,6 +130,18 @@ static SRC_STATE_VT sinc_mono_state_vt =
 	sinc_close
 } ;
 
+#ifdef MULTI_THREADING
+static SRC_ERROR sinc_multithread_vari_process (SRC_STATE *state, SRC_DATA *data) ;
+static SRC_STATE_VT sinc_multithread_state_vt =
+{
+	sinc_multithread_vari_process,
+	sinc_multithread_vari_process,
+	sinc_reset,
+	sinc_copy,
+	sinc_close
+} ;
+#endif
+
 static inline increment_t
 double_to_fp (double x)
 {	return (increment_t) (psf_lrint ((x) * FP_ONE)) ;
@@ -203,6 +215,384 @@ sinc_get_description (int src_enum)
 	return NULL ;
 } /* sinc_get_descrition */
 
+#ifdef MULTI_THREADING
+
+#include <omp.h>
+#include <unistd.h>
+
+#define MULTI_THREADING_THRESHOLD (256)
+
+__attribute__((always_inline)) static void
+calc_output_multi_mt_core(const int NUM_OF_THREADS, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
+{
+	assert(NUM_OF_THREADS == 1 || NUM_OF_THREADS % 2 == 0);
+	double left[MAX_CHANNELS] = {0};
+	double right[MAX_CHANNELS] = {0};
+
+	/* Convert input parameters into fixed point. */
+	const increment_t max_filter_index = int_to_fp(filter->coeff_half_len);
+
+	const int mt_increment_factor = (NUM_OF_THREADS > 1) ? NUM_OF_THREADS / 2 : 1;
+	const int mt_left_right_sw = child_no % 2;
+	const int mt_shift = child_no / 2;
+
+	if (!mt_left_right_sw || NUM_OF_THREADS == 1)
+	{
+			/* First apply the left half of the filter. */
+			increment_t filter_index1 = start_filter_index;
+			const int coeff_count1 = (max_filter_index - filter_index1) / increment;
+			filter_index1 = filter_index1 + coeff_count1 * increment;
+			int data_index1 = filter->b_current - channels * coeff_count1;
+
+			if (data_index1 < 0) /* Avoid underflow access to filter->buffer. */
+			{
+				int steps = int_div_ceil(-data_index1, channels);
+				/* If the assert triggers we would have to take care not to underflow/overflow */
+				assert(steps <= int_div_ceil(filter_index1, increment));
+				filter_index1 -= increment * steps;
+				data_index1 += steps * channels;
+			}
+
+			filter_index1 -= increment * mt_shift;
+			data_index1 = data_index1 + channels * mt_shift;
+
+			// left = 0.0;
+			while (filter_index1 >= MAKE_INCREMENT_T(0))
+			{
+				const double fraction = fp_to_double(filter_index1);
+				const int indx = fp_to_int(filter_index1);
+				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
+				const double icoeff = filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
+				assert(data_index1 >= 0 && data_index1 + channels - 1 < filter->b_len);
+				assert(data_index1 + channels - 1 < filter->b_end);
+				for (int ch = 0; ch < channels; ch++)
+					left[ch] += icoeff * filter->buffer[data_index1 + ch];
+
+				filter_index1 -= increment * mt_increment_factor;
+				data_index1 = data_index1 + channels * mt_increment_factor;
+			};
+	}
+
+	if (mt_left_right_sw || NUM_OF_THREADS == 1)
+	{
+			/* Now apply the right half of the filter. */
+			increment_t filter_index2 = increment - start_filter_index;
+			const int coeff_count2 = (max_filter_index - filter_index2) / increment;
+			filter_index2 = filter_index2 + coeff_count2 * increment;
+			int data_index2 = filter->b_current + channels * (1 + coeff_count2);
+			// right = 0.0;
+
+			if (!mt_shift)
+			{
+				const double fraction = fp_to_double(filter_index2);
+				const int indx = fp_to_int(filter_index2);
+				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
+				const double icoeff = filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
+				assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len);
+				assert(data_index2 + channels - 1 < filter->b_end);
+				for (int ch = 0; ch < channels; ch++)
+					right[ch] += icoeff * filter->buffer[data_index2 + ch];
+			}
+
+			filter_index2 -= increment;
+			data_index2 = data_index2 - channels;
+
+			filter_index2 -= increment * mt_shift;
+			data_index2 = data_index2 - channels * mt_shift;
+
+			while (filter_index2 > MAKE_INCREMENT_T(0))
+			{
+				const double fraction = fp_to_double(filter_index2);
+				const int indx = fp_to_int(filter_index2);
+				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
+				const double icoeff = filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
+				assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len);
+				assert(data_index2 + channels - 1 < filter->b_end);
+				for (int ch = 0; ch < channels; ch++)
+					right[ch] += icoeff * filter->buffer[data_index2 + ch];
+
+				filter_index2 -= increment * mt_increment_factor;
+				data_index2 = data_index2 - channels * mt_increment_factor;
+			}
+	}
+
+	for (int ch = 0; ch < channels; ch++)
+			output[ch] = (scale * (left[ch] + right[ch])); // double
+} /* calc_output_stereo */
+
+__attribute__((always_inline)) static void
+calc_output_multi_mt_2(const int NUM_OF_THREADS, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
+{
+#define OPTIMIZE_LINE(x)                                                                                                  \
+	case (x):                                                                                                             \
+			calc_output_multi_mt_core(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, x, scale, output); \
+			break;
+
+	switch (channels)
+	{
+			OPTIMIZE_LINE(1);
+			OPTIMIZE_LINE(2);
+			OPTIMIZE_LINE(3);
+			OPTIMIZE_LINE(4);
+			OPTIMIZE_LINE(5);
+			OPTIMIZE_LINE(6);
+			OPTIMIZE_LINE(7);
+			OPTIMIZE_LINE(8);
+			OPTIMIZE_LINE(9);
+			OPTIMIZE_LINE(10);
+			OPTIMIZE_LINE(11);
+			OPTIMIZE_LINE(12);
+			OPTIMIZE_LINE(13);
+			OPTIMIZE_LINE(14);
+			OPTIMIZE_LINE(15);
+			OPTIMIZE_LINE(16);
+	default:
+			calc_output_multi_mt_core(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, channels, scale, output);
+			break;
+	}
+#undef OPTIMIZE_LINE
+}
+
+__attribute__((always_inline)) static void
+calc_output_multi_mt(const int NUM_OF_THREADS, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
+{
+#define OPTIMIZE_LINE(x)                                                                                         \
+	case (x):                                                                                                    \
+			calc_output_multi_mt_2(x, child_no, filter, increment, start_filter_index, channels, scale, output); \
+			break;
+
+	switch (NUM_OF_THREADS)
+	{
+			OPTIMIZE_LINE(1);
+			OPTIMIZE_LINE(2);
+			OPTIMIZE_LINE(4);
+			OPTIMIZE_LINE(6);
+			OPTIMIZE_LINE(8);
+			OPTIMIZE_LINE(10);
+			OPTIMIZE_LINE(12);
+			OPTIMIZE_LINE(14);
+			OPTIMIZE_LINE(16);
+	default:
+			calc_output_multi_mt_2(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, channels, scale, output);
+			break;
+	}
+#undef OPTIMIZE_LINE
+}
+
+static SRC_ERROR
+_sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, SRC_STATE *const state, SRC_DATA *const data)
+{
+	SINC_FILTER *filter = (SINC_FILTER *)state->private_data;
+	double input_index, src_ratio, count, float_increment, terminate, rem;
+	increment_t increment, start_filter_index;
+	int half_filter_chan_len, samples_in_hand;
+
+	if (state->private_data == NULL)
+			return SRC_ERR_NO_PRIVATE;
+
+	/* If there is not a problem, this will be optimised out. */
+	if (sizeof(filter->buffer[0]) != sizeof(data->data_in[0]))
+			return SRC_ERR_SIZE_INCOMPATIBILITY;
+
+	const int channels = state->channels;
+	filter->in_count = data->input_frames * channels;
+	filter->out_count = data->output_frames * channels;
+	filter->in_used = filter->out_gen = 0;
+
+	src_ratio = state->last_ratio;
+
+	if (is_bad_src_ratio(src_ratio))
+			return SRC_ERR_BAD_INTERNAL_STATE;
+
+	/* Check the sample rate ratio wrt the buffer len. */
+	count = (filter->coeff_half_len + 2.0) / filter->index_inc;
+	if (MIN(state->last_ratio, data->src_ratio) < 1.0)
+			count /= MIN(state->last_ratio, data->src_ratio);
+
+	/* Maximum coefficientson either side of center point. */
+	half_filter_chan_len = state->channels * (int)(psf_lrint(count) + 1);
+
+	input_index = state->last_position;
+
+	rem = fmod_one(input_index);
+	filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)) % filter->b_len;
+	input_index = rem;
+
+	terminate = 1.0 / src_ratio + 1e-20;
+
+	double *const data_out = (double *const)data->data_out;
+	const int out_count = filter->out_count;
+	const int index_inc = filter->index_inc;
+
+	/* Main processing loop. */
+	while (filter->out_gen < out_count)
+	{
+			/* Need to reload buffer? */
+			samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len;
+
+			if (samples_in_hand <= half_filter_chan_len)
+			{
+				if ((state->error = prepare_data(filter, channels, data, half_filter_chan_len)) != 0)
+					return state->error;
+
+				samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len;
+				if (samples_in_hand <= half_filter_chan_len)
+					break;
+			};
+
+			/* This is the termination condition. */
+			if (filter->b_real_end >= 0)
+			{
+				if (filter->b_current + input_index + terminate >= filter->b_real_end)
+					break;
+			};
+
+			if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10)
+				src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count;
+
+			float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0);
+			increment = double_to_fp(float_increment);
+
+			start_filter_index = double_to_fp(input_index * float_increment);
+
+			calc_output_multi_mt(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, channels, float_increment / index_inc, data_out + filter->out_gen);
+			filter->out_gen += channels;
+
+			/* Figure out the next index. */
+			input_index += 1.0 / src_ratio;
+			rem = fmod_one(input_index);
+
+			filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)) % filter->b_len;
+			input_index = rem;
+	};
+
+	state->last_position = input_index;
+
+	/* Save current ratio rather then target ratio. */
+	state->last_ratio = src_ratio;
+
+	data->input_frames_used = filter->in_used / state->channels;
+	data->output_frames_gen = filter->out_gen / state->channels;
+
+	return SRC_ERR_NO_ERROR;
+}
+
+static SRC_ERROR
+sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
+{
+	if (state->private_data == NULL)
+			return SRC_ERR_NO_PRIVATE;
+
+	const long in_count = data->input_frames * state->channels;
+	const long out_count = data->output_frames * state->channels;
+
+	SINC_FILTER *filter = (SINC_FILTER *)state->private_data;
+	const int filter_buffer_len = (filter->b_len + state->channels);
+
+	const int should_be_single_thread = (sysconf(_SC_NPROCESSORS_ONLN) < 2 || in_count < MULTI_THREADING_THRESHOLD);
+
+	const int NUM_OF_THREADS = should_be_single_thread ? 1 : (sysconf(_SC_NPROCESSORS_ONLN) / 2 * 2);
+
+	SRC_STATE *per_thread_state = (SRC_STATE *)malloc(NUM_OF_THREADS * sizeof(SRC_STATE));
+	SRC_DATA *per_thread_data = (SRC_DATA *)malloc(NUM_OF_THREADS * sizeof(SRC_DATA));
+	SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(NUM_OF_THREADS * sizeof(SINC_FILTER));
+	float **per_thread_buffer = (float **)malloc(NUM_OF_THREADS * sizeof(float *));
+	double **per_thread_data_out = (double **)malloc(NUM_OF_THREADS * sizeof(double *));
+	SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(NUM_OF_THREADS * sizeof(SRC_ERROR));
+
+	SRC_ERROR retval;
+
+	if (NUM_OF_THREADS == 1)
+	{
+			per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double));
+			void *tmp_data_out = data->data_out;
+
+			data->data_out = (void *)per_thread_data_out[0];
+
+			per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data);
+
+			data->data_out = tmp_data_out;
+
+			for (int count = 0; count < filter->out_gen; count++)
+			{
+				data->data_out[count] = per_thread_data_out[0][count];
+			}
+
+			free(per_thread_data_out[0]);
+			per_thread_data_out[0] = NULL;
+
+			goto cleanup_and_return;
+	}
+
+	// OpenMP
+	omp_set_num_threads(NUM_OF_THREADS);
+
+#pragma omp parallel for
+	for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++)
+	{
+			memcpy(&per_thread_data[child_no], data, sizeof(SRC_DATA));
+			memcpy(&per_thread_filter[child_no], filter, sizeof(SINC_FILTER));
+
+			memcpy(&per_thread_state[child_no], state, sizeof(SRC_STATE));
+			per_thread_state[child_no].private_data = &per_thread_filter[child_no];
+
+			per_thread_buffer[child_no] = (float *)malloc(filter_buffer_len * sizeof(float));
+			memcpy(per_thread_buffer[child_no], filter->buffer, filter_buffer_len * sizeof(float));
+			per_thread_filter[child_no].buffer = per_thread_buffer[child_no];
+
+			per_thread_data_out[child_no] = (double *)malloc(out_count * sizeof(double));
+			per_thread_data[child_no].data_out = (void *)per_thread_data_out[child_no];
+
+			per_thread_retval[child_no] = _sinc_multichan_vari_process_mt(NUM_OF_THREADS, child_no,
+																		  &per_thread_state[child_no], &per_thread_data[child_no]);
+	}
+
+	memcpy(filter->buffer, per_thread_buffer[0], filter_buffer_len * sizeof(float));
+
+	float *buf = filter->buffer;
+	memcpy(filter, &per_thread_filter[0], sizeof(SINC_FILTER));
+	filter->buffer = buf;
+
+	memcpy(state, &per_thread_state[0], sizeof(SRC_STATE));
+	state->private_data = filter;
+
+	float *d_out = data->data_out;
+	memcpy(data, &per_thread_data[0], sizeof(SRC_DATA));
+	data->data_out = d_out;
+
+#pragma omp parallel for
+	for (int count = 0; count < filter->out_gen; count++)
+	{
+			double sum = 0.0;
+			for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++)
+			{
+				sum += per_thread_data_out[child_no][count];
+			}
+			data->data_out[count] = (float)sum;
+	}
+
+#pragma omp parallel for
+	for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++)
+	{
+			free(per_thread_buffer[child_no]);
+			free(per_thread_data_out[child_no]);
+	}
+
+cleanup_and_return:
+	retval = per_thread_retval[0];
+
+	free(per_thread_state);
+	free(per_thread_data);
+	free(per_thread_filter);
+	free(per_thread_buffer);
+	free(per_thread_retval);
+	free(per_thread_data_out);
+
+	return retval;
+}
+
+#endif /* MULTI_THREADING*/
+
 static SINC_FILTER *
 sinc_filter_new (int converter_type, int channels)
 {
@@ -282,6 +672,9 @@ sinc_state_new (int converter_type, int channels, SRC_ERROR *error)
 	state->channels = channels ;
 	state->mode = SRC_MODE_PROCESS ;
 
+	#ifdef MULTI_THREADING
+		state->vt = &sinc_multithread_state_vt ;
+	#else
 	if (state->channels == 1)
 		state->vt = &sinc_mono_state_vt ;
 	else if (state->channels == 2)
@@ -292,6 +685,7 @@ sinc_state_new (int converter_type, int channels, SRC_ERROR *error)
 		state->vt = &sinc_hex_state_vt ;
 	else
 		state->vt = &sinc_multichan_state_vt ;
+	#endif
 
 	state->private_data = sinc_filter_new (converter_type, state->channels) ;
 	if (!state->private_data)
diff --git a/tests/multichan_throughput_test.c b/tests/multichan_throughput_test.c
index 5cab44a1..9cf5639b 100644
--- a/tests/multichan_throughput_test.c
+++ b/tests/multichan_throughput_test.c
@@ -41,7 +41,11 @@ static float output [BUFFER_LEN] ;
 static void
 throughput_test (int converter, int channels, long *best_throughput)
 {	SRC_DATA src_data ;
+#ifdef MULTI_THREADING
+	struct timespec start_gettime, finish_gettime;
+#else
 	clock_t start_time, clock_time ;
+#endif
 	double duration ;
 	long total_frames = 0, throughput ;
 	int error ;
@@ -63,7 +67,11 @@ throughput_test (int converter, int channels, long *best_throughput)
 	sleep (2) ;
 #endif
 
+#ifdef MULTI_THREADING
+	clock_gettime(CLOCK_MONOTONIC, &start_gettime);
+#else
 	start_time = clock () ;
+#endif
 
 	do
 	{
@@ -74,8 +82,15 @@ throughput_test (int converter, int channels, long *best_throughput)
 
 		total_frames += src_data.output_frames_gen ;
 
+#ifdef MULTI_THREADING
+		clock_gettime(CLOCK_MONOTONIC, &finish_gettime);
+
+		duration = (finish_gettime.tv_sec - start_gettime.tv_sec);
+		duration += (finish_gettime.tv_nsec - start_gettime.tv_nsec) / 1000000000.0;
+#else
 		clock_time = clock () - start_time ;
 		duration = (1.0 * clock_time) / CLOCKS_PER_SEC ;
+#endif
 	}
 	while (duration < 5.0) ;
 
diff --git a/tests/throughput_test.c b/tests/throughput_test.c
index e9974800..0a6aa834 100644
--- a/tests/throughput_test.c
+++ b/tests/throughput_test.c
@@ -38,7 +38,11 @@ static float output [BUFFER_LEN] ;
 static long
 throughput_test (int converter, long best_throughput)
 {	SRC_DATA src_data ;
+#ifdef MULTI_THREADING
+	struct timespec start_gettime, finish_gettime;
+#else
 	clock_t start_time, clock_time ;
+#endif
 	double duration ;
 	long total_frames = 0, throughput ;
 	int error ;
@@ -60,7 +64,11 @@ throughput_test (int converter, long best_throughput)
 	sleep (2) ;
 #endif
 
+#ifdef MULTI_THREADING
+	clock_gettime(CLOCK_MONOTONIC, &start_gettime);
+#else
 	start_time = clock () ;
+#endif
 
 	do
 	{
@@ -71,11 +79,18 @@ throughput_test (int converter, long best_throughput)
 
 		total_frames += src_data.output_frames_gen ;
 
+#ifdef MULTI_THREADING
+		clock_gettime(CLOCK_MONOTONIC, &finish_gettime);
+
+		duration = (finish_gettime.tv_sec - start_gettime.tv_sec);
+		duration += (finish_gettime.tv_nsec - start_gettime.tv_nsec) / 1000000000.0;
+#else
 		clock_time = clock () - start_time ;
 #ifdef __GNU__ /* Clock resolution is 10ms on GNU/Hurd */
 		duration = (10000.0 * clock_time) / CLOCKS_PER_SEC ;
 #else
 		duration = (1.0 * clock_time) / CLOCKS_PER_SEC ;
+#endif
 #endif
 	}
 	while (duration < 3.0) ;

From df1aaae73ca1df4cdc7d72a73a70e40dcade0449 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Thu, 14 Sep 2023 22:18:26 +0900
Subject: [PATCH 02/18] MT disable by default.

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 13cb59ca..4a8f2b3f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,10 +72,10 @@ endif()
 if(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
   option(LIBSAMPLERATE_ENABLE_SANITIZERS "Enable ASAN and UBSAN" OFF)
 
-  #if (BUILD_MULTI_THREADING)
+  if (BUILD_MULTI_THREADING)
     set(MULTI_THREADING 1)
     add_definitions(-DMULTI_THREADING)
-  #endif()
+  endif()
 
   if(LIBSAMPLERATE_ENABLE_SANITIZERS)
     # Use ASAN and UBSAN, make it fail on any error, improve stack traces

From 8c0f67615176546647c9a880039d2a241e95675c Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Fri, 15 Sep 2023 19:19:26 +0900
Subject: [PATCH 03/18] error handlings and small refactor

---
 src/src_sinc.c | 155 +++++++++++++++++++++++++++++++------------------
 1 file changed, 97 insertions(+), 58 deletions(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index 6f757439..fa66adbe 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -220,23 +220,25 @@ sinc_get_description (int src_enum)
 #include <omp.h>
 #include <unistd.h>
 
+/* smaller frames are processed in single thread to avoid overheads */
 #define MULTI_THREADING_THRESHOLD (256)
 
 __attribute__((always_inline)) static void
-calc_output_multi_mt_core(const int NUM_OF_THREADS, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
+calc_output_multi_mt_core(const int num_of_threads, const int child_no, 
+	const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
 {
-	assert(NUM_OF_THREADS == 1 || NUM_OF_THREADS % 2 == 0);
+	assert(num_of_threads == 1 || num_of_threads % 2 == 0);
 	double left[MAX_CHANNELS] = {0};
 	double right[MAX_CHANNELS] = {0};
 
 	/* Convert input parameters into fixed point. */
 	const increment_t max_filter_index = int_to_fp(filter->coeff_half_len);
 
-	const int mt_increment_factor = (NUM_OF_THREADS > 1) ? NUM_OF_THREADS / 2 : 1;
+	const int mt_increment_factor = (num_of_threads > 1) ? num_of_threads / 2 : 1;
 	const int mt_left_right_sw = child_no % 2;
 	const int mt_shift = child_no / 2;
 
-	if (!mt_left_right_sw || NUM_OF_THREADS == 1)
+	if (!mt_left_right_sw || num_of_threads == 1)
 	{
 			/* First apply the left half of the filter. */
 			increment_t filter_index1 = start_filter_index;
@@ -273,7 +275,7 @@ calc_output_multi_mt_core(const int NUM_OF_THREADS, const int child_no, const SI
 			};
 	}
 
-	if (mt_left_right_sw || NUM_OF_THREADS == 1)
+	if (mt_left_right_sw || num_of_threads == 1)
 	{
 			/* Now apply the right half of the filter. */
 			increment_t filter_index2 = increment - start_filter_index;
@@ -321,14 +323,15 @@ calc_output_multi_mt_core(const int NUM_OF_THREADS, const int child_no, const SI
 } /* calc_output_stereo */
 
 __attribute__((always_inline)) static void
-calc_output_multi_mt_2(const int NUM_OF_THREADS, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
+calc_output_multi_mt_2(const int num_of_threads, const int child_no, 
+	const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
 {
 #define OPTIMIZE_LINE(x)                                                                                                  \
 	case (x):                                                                                                             \
-			calc_output_multi_mt_core(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, x, scale, output); \
+			calc_output_multi_mt_core(num_of_threads, child_no, filter, increment, start_filter_index, x, scale, output); \
 			break;
 
-	switch (channels)
+	switch (channels) // to kick the compile-time optimizer, channel numbers up to 16 are extracted as constants here.
 	{
 			OPTIMIZE_LINE(1);
 			OPTIMIZE_LINE(2);
@@ -347,21 +350,22 @@ calc_output_multi_mt_2(const int NUM_OF_THREADS, const int child_no, const SINC_
 			OPTIMIZE_LINE(15);
 			OPTIMIZE_LINE(16);
 	default:
-			calc_output_multi_mt_core(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_core(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output);
 			break;
 	}
 #undef OPTIMIZE_LINE
 }
 
 __attribute__((always_inline)) static void
-calc_output_multi_mt(const int NUM_OF_THREADS, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
+calc_output_multi_mt(const int num_of_threads, const int child_no, 
+	const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
 {
 #define OPTIMIZE_LINE(x)                                                                                         \
 	case (x):                                                                                                    \
 			calc_output_multi_mt_2(x, child_no, filter, increment, start_filter_index, channels, scale, output); \
 			break;
 
-	switch (NUM_OF_THREADS)
+	switch (num_of_threads) // to kick the compile-time optimizer, the number of threads is extracted as constant here.
 	{
 			OPTIMIZE_LINE(1);
 			OPTIMIZE_LINE(2);
@@ -373,14 +377,15 @@ calc_output_multi_mt(const int NUM_OF_THREADS, const int child_no, const SINC_FI
 			OPTIMIZE_LINE(14);
 			OPTIMIZE_LINE(16);
 	default:
-			calc_output_multi_mt_2(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_2(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output);
 			break;
 	}
 #undef OPTIMIZE_LINE
 }
 
 static SRC_ERROR
-_sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, SRC_STATE *const state, SRC_DATA *const data)
+_sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, double *const per_thread_data_out,
+	SRC_STATE *const state, SRC_DATA *const data)
 {
 	SINC_FILTER *filter = (SINC_FILTER *)state->private_data;
 	double input_index, src_ratio, count, float_increment, terminate, rem;
@@ -420,8 +425,7 @@ _sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, SR
 
 	terminate = 1.0 / src_ratio + 1e-20;
 
-	double *const data_out = (double *const)data->data_out;
-	const int out_count = filter->out_count;
+	const long out_count = filter->out_count;
 	const int index_inc = filter->index_inc;
 
 	/* Main processing loop. */
@@ -443,7 +447,7 @@ _sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, SR
 			/* This is the termination condition. */
 			if (filter->b_real_end >= 0)
 			{
-				if (filter->b_current + input_index + terminate >= filter->b_real_end)
+				if (filter->b_current + input_index + terminate > filter->b_real_end)
 					break;
 			};
 
@@ -455,7 +459,7 @@ _sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, SR
 
 			start_filter_index = double_to_fp(input_index * float_increment);
 
-			calc_output_multi_mt(NUM_OF_THREADS, child_no, filter, increment, start_filter_index, channels, float_increment / index_inc, data_out + filter->out_gen);
+			calc_output_multi_mt(num_of_threads, child_no, filter, increment, start_filter_index, channels, float_increment / index_inc, per_thread_data_out + filter->out_gen);
 			filter->out_gen += channels;
 
 			/* Figure out the next index. */
@@ -491,62 +495,80 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 
 	const int should_be_single_thread = (sysconf(_SC_NPROCESSORS_ONLN) < 2 || in_count < MULTI_THREADING_THRESHOLD);
 
-	const int NUM_OF_THREADS = should_be_single_thread ? 1 : (sysconf(_SC_NPROCESSORS_ONLN) / 2 * 2);
+	const int num_of_threads = should_be_single_thread ? 1 : (sysconf(_SC_NPROCESSORS_ONLN) / 2 * 2);
 
-	SRC_STATE *per_thread_state = (SRC_STATE *)malloc(NUM_OF_THREADS * sizeof(SRC_STATE));
-	SRC_DATA *per_thread_data = (SRC_DATA *)malloc(NUM_OF_THREADS * sizeof(SRC_DATA));
-	SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(NUM_OF_THREADS * sizeof(SINC_FILTER));
-	float **per_thread_buffer = (float **)malloc(NUM_OF_THREADS * sizeof(float *));
-	double **per_thread_data_out = (double **)malloc(NUM_OF_THREADS * sizeof(double *));
-	SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(NUM_OF_THREADS * sizeof(SRC_ERROR));
+	SRC_STATE *per_thread_state = (SRC_STATE *)malloc(num_of_threads * sizeof(SRC_STATE));
+	SRC_DATA *per_thread_data = (SRC_DATA *)malloc(num_of_threads * sizeof(SRC_DATA));
+	SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(num_of_threads * sizeof(SINC_FILTER));
+	SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(num_of_threads * sizeof(SRC_ERROR));
+	
+	float **per_thread_buffer = (float **)calloc(num_of_threads, sizeof(float *));
+	double **per_thread_data_out = (double **)calloc(num_of_threads, sizeof(double *));
 
-	SRC_ERROR retval;
+	SRC_ERROR retval = SRC_ERR_MALLOC_FAILED;
 
-	if (NUM_OF_THREADS == 1)
+	if ( !per_thread_state || !per_thread_data || !per_thread_filter
+		 || !per_thread_buffer || !per_thread_data_out || !per_thread_retval )
 	{
-			per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double));
-			void *tmp_data_out = data->data_out;
-
-			data->data_out = (void *)per_thread_data_out[0];
+			goto cleanup_and_return;
+	}
 
-			per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data);
+	if (num_of_threads == 1)		// w/o OpenMP
+	{
+			per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double));
+			if ( !per_thread_data_out[0] ) goto cleanup_and_return;
 
-			data->data_out = tmp_data_out;
+			per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, per_thread_data_out[0], state, data);
 
 			for (int count = 0; count < filter->out_gen; count++)
 			{
 				data->data_out[count] = per_thread_data_out[0][count];
 			}
 
-			free(per_thread_data_out[0]);
-			per_thread_data_out[0] = NULL;
+			retval = per_thread_retval[0];
 
 			goto cleanup_and_return;
 	}
 
 	// OpenMP
-	omp_set_num_threads(NUM_OF_THREADS);
+	omp_set_num_threads(num_of_threads);
 
 #pragma omp parallel for
-	for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++)
+	for (int child_no = 0; child_no < num_of_threads; child_no++)
 	{
+			per_thread_buffer[child_no] = (float *)malloc(filter_buffer_len * sizeof(float));
+			per_thread_data_out[child_no] = (double *)malloc(out_count * sizeof(double));
+
+			if ( !per_thread_buffer[child_no] || !per_thread_data_out[child_no] ){
+
+				per_thread_retval[child_no] = SRC_ERR_MALLOC_FAILED;
+
+				continue;
+			}
+
 			memcpy(&per_thread_data[child_no], data, sizeof(SRC_DATA));
 			memcpy(&per_thread_filter[child_no], filter, sizeof(SINC_FILTER));
 
 			memcpy(&per_thread_state[child_no], state, sizeof(SRC_STATE));
 			per_thread_state[child_no].private_data = &per_thread_filter[child_no];
 
-			per_thread_buffer[child_no] = (float *)malloc(filter_buffer_len * sizeof(float));
 			memcpy(per_thread_buffer[child_no], filter->buffer, filter_buffer_len * sizeof(float));
 			per_thread_filter[child_no].buffer = per_thread_buffer[child_no];
 
-			per_thread_data_out[child_no] = (double *)malloc(out_count * sizeof(double));
-			per_thread_data[child_no].data_out = (void *)per_thread_data_out[child_no];
+			per_thread_retval[child_no] = _sinc_multichan_vari_process_mt(
+                num_of_threads, child_no, per_thread_data_out[child_no],
+				&per_thread_state[child_no], &per_thread_data[child_no]);
+	}
 
-			per_thread_retval[child_no] = _sinc_multichan_vari_process_mt(NUM_OF_THREADS, child_no,
-																		  &per_thread_state[child_no], &per_thread_data[child_no]);
+	// error checking for each worker
+	for (int child_no = 0; child_no < num_of_threads; child_no++){
+		if ( per_thread_retval[child_no] != SRC_ERR_NO_ERROR ){
+			retval = per_thread_retval[child_no];
+			goto cleanup_and_return;
+		}
 	}
 
+	// update filter status
 	memcpy(filter->buffer, per_thread_buffer[0], filter_buffer_len * sizeof(float));
 
 	float *buf = filter->buffer;
@@ -556,37 +578,54 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	memcpy(state, &per_thread_state[0], sizeof(SRC_STATE));
 	state->private_data = filter;
 
-	float *d_out = data->data_out;
 	memcpy(data, &per_thread_data[0], sizeof(SRC_DATA));
-	data->data_out = d_out;
 
 #pragma omp parallel for
-	for (int count = 0; count < filter->out_gen; count++)
+	for (int count = 0; count < filter->out_gen; count++)   // sum up every worker's result
 	{
 			double sum = 0.0;
-			for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++)
+			for (int child_no = 0; child_no < num_of_threads; child_no++)
 			{
 				sum += per_thread_data_out[child_no][count];
 			}
 			data->data_out[count] = (float)sum;
 	}
 
-#pragma omp parallel for
-	for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++)
+	retval = SRC_ERR_NO_ERROR;
+
+cleanup_and_return:
+
+	if (per_thread_state)
+			free(per_thread_state);
+
+	if (per_thread_data)
+			free(per_thread_data);
+
+	if (per_thread_filter)
+			free(per_thread_filter);
+	
+	if (per_thread_retval)
+			free(per_thread_retval);
+
+	if (per_thread_buffer)
 	{
-			free(per_thread_buffer[child_no]);
-			free(per_thread_data_out[child_no]);
+			for (int child_no = 0; child_no < num_of_threads; child_no++)
+			{
+				if (per_thread_buffer[child_no])
+					free(per_thread_buffer[child_no]);
+			}
+
+			free(per_thread_buffer);
 	}
 
-cleanup_and_return:
-	retval = per_thread_retval[0];
-
-	free(per_thread_state);
-	free(per_thread_data);
-	free(per_thread_filter);
-	free(per_thread_buffer);
-	free(per_thread_retval);
-	free(per_thread_data_out);
+	if (per_thread_data_out) {
+			for (int child_no = 0; child_no < num_of_threads; child_no++) {
+				if (per_thread_data_out[child_no])
+					free(per_thread_data_out[child_no]);
+			}
+
+			free(per_thread_data_out);
+	}
 
 	return retval;
 }

From 2799ec346715f6812c46e643b7d08c197d893299 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Sat, 16 Sep 2023 20:19:02 +0900
Subject: [PATCH 04/18] some more optimize

---
 src/src_sinc.c | 91 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 60 insertions(+), 31 deletions(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index fa66adbe..df0e6bc6 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -224,7 +224,7 @@ sinc_get_description (int src_enum)
 #define MULTI_THREADING_THRESHOLD (256)
 
 __attribute__((always_inline)) static void
-calc_output_multi_mt_core(const int num_of_threads, const int child_no, 
+calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, const int child_no, 
 	const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
 {
 	assert(num_of_threads == 1 || num_of_threads % 2 == 0);
@@ -264,7 +264,7 @@ calc_output_multi_mt_core(const int num_of_threads, const int child_no,
 				const double fraction = fp_to_double(filter_index1);
 				const int indx = fp_to_int(filter_index1);
 				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
-				const double icoeff = filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
+				const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
 				assert(data_index1 >= 0 && data_index1 + channels - 1 < filter->b_len);
 				assert(data_index1 + channels - 1 < filter->b_end);
 				for (int ch = 0; ch < channels; ch++)
@@ -289,7 +289,7 @@ calc_output_multi_mt_core(const int num_of_threads, const int child_no,
 				const double fraction = fp_to_double(filter_index2);
 				const int indx = fp_to_int(filter_index2);
 				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
-				const double icoeff = filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
+				const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
 				assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len);
 				assert(data_index2 + channels - 1 < filter->b_end);
 				for (int ch = 0; ch < channels; ch++)
@@ -307,7 +307,7 @@ calc_output_multi_mt_core(const int num_of_threads, const int child_no,
 				const double fraction = fp_to_double(filter_index2);
 				const int indx = fp_to_int(filter_index2);
 				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
-				const double icoeff = filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
+				const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
 				assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len);
 				assert(data_index2 + channels - 1 < filter->b_end);
 				for (int ch = 0; ch < channels; ch++)
@@ -323,12 +323,28 @@ calc_output_multi_mt_core(const int num_of_threads, const int child_no,
 } /* calc_output_stereo */
 
 __attribute__((always_inline)) static void
-calc_output_multi_mt_2(const int num_of_threads, const int child_no, 
+calc_output_multi_mt_3(const int num_of_threads, const int child_no,
 	const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
 {
-#define OPTIMIZE_LINE(x)                                                                                                  \
-	case (x):                                                                                                             \
-			calc_output_multi_mt_core(num_of_threads, child_no, filter, increment, start_filter_index, x, scale, output); \
+
+	const int skip_fraction = increment == ((increment >> SHIFT_BITS) << SHIFT_BITS) && start_filter_index == ((start_filter_index >> SHIFT_BITS) << SHIFT_BITS) ? 1 : 0;
+
+	if (skip_fraction)
+	{
+			calc_output_multi_mt_core(1, num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output);
+	}
+	else
+	{
+			calc_output_multi_mt_core(0, num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output);
+	}
+}
+
+__attribute__((always_inline)) static void
+calc_output_multi_mt_2(const int num_of_threads, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
+{
+#define OPTIMIZE_LINE(x)                                                                                               \
+	case (x):                                                                                                          \
+			calc_output_multi_mt_3(num_of_threads, child_no, filter, increment, start_filter_index, x, scale, output); \
 			break;
 
 	switch (channels) // to kick the compile-time optimizer, channel numbers up to 16 are extracted as constants here.
@@ -350,7 +366,7 @@ calc_output_multi_mt_2(const int num_of_threads, const int child_no,
 			OPTIMIZE_LINE(15);
 			OPTIMIZE_LINE(16);
 	default:
-			calc_output_multi_mt_core(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_3(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output);
 			break;
 	}
 #undef OPTIMIZE_LINE
@@ -387,14 +403,11 @@ static SRC_ERROR
 _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, double *const per_thread_data_out,
 	SRC_STATE *const state, SRC_DATA *const data)
 {
-	SINC_FILTER *filter = (SINC_FILTER *)state->private_data;
-	double input_index, src_ratio, count, float_increment, terminate, rem;
-	increment_t increment, start_filter_index;
-	int half_filter_chan_len, samples_in_hand;
-
 	if (state->private_data == NULL)
 			return SRC_ERR_NO_PRIVATE;
 
+	SINC_FILTER *filter = (SINC_FILTER *)state->private_data;
+
 	/* If there is not a problem, this will be optimised out. */
 	if (sizeof(filter->buffer[0]) != sizeof(data->data_in[0]))
 			return SRC_ERR_SIZE_INCOMPATIBILITY;
@@ -404,35 +417,41 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do
 	filter->out_count = data->output_frames * channels;
 	filter->in_used = filter->out_gen = 0;
 
-	src_ratio = state->last_ratio;
+	double src_ratio = state->last_ratio;
 
 	if (is_bad_src_ratio(src_ratio))
 			return SRC_ERR_BAD_INTERNAL_STATE;
 
 	/* Check the sample rate ratio wrt the buffer len. */
-	count = (filter->coeff_half_len + 2.0) / filter->index_inc;
+	double count = (filter->coeff_half_len + 2.0) / filter->index_inc;
 	if (MIN(state->last_ratio, data->src_ratio) < 1.0)
 			count /= MIN(state->last_ratio, data->src_ratio);
 
 	/* Maximum coefficientson either side of center point. */
-	half_filter_chan_len = state->channels * (int)(psf_lrint(count) + 1);
+	const int half_filter_chan_len = state->channels * (int)(psf_lrint(count) + 1);
 
-	input_index = state->last_position;
+	double input_index = state->last_position;
 
-	rem = fmod_one(input_index);
+	double rem = fmod_one(input_index);
 	filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)) % filter->b_len;
 	input_index = rem;
 
-	terminate = 1.0 / src_ratio + 1e-20;
+	const double terminate = 1.0 / src_ratio + 1e-20;
 
 	const long out_count = filter->out_count;
 	const int index_inc = filter->index_inc;
 
+    const int is_constant_ratio = (state->last_ratio == data->src_ratio) ? 1 : 0;
+    const double constant_input_index_inc = 1.0 / src_ratio;
+    const double constant_float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0);
+    const increment_t constant_increment = double_to_fp(constant_float_increment);
+    const double constant_scale = constant_float_increment / index_inc;
+
 	/* Main processing loop. */
 	while (filter->out_gen < out_count)
 	{
 			/* Need to reload buffer? */
-			samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len;
+			int samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len;
 
 			if (samples_in_hand <= half_filter_chan_len)
 			{
@@ -451,19 +470,29 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do
 					break;
 			};
 
-			if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10)
-				src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count;
-
-			float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0);
-			increment = double_to_fp(float_increment);
-
-			start_filter_index = double_to_fp(input_index * float_increment);
-
-			calc_output_multi_mt(num_of_threads, child_no, filter, increment, start_filter_index, channels, float_increment / index_inc, per_thread_data_out + filter->out_gen);
+            double scale, float_increment;
+            increment_t increment;
+            if ( !is_constant_ratio ){
+                if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10)
+                    src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count;
+
+                float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0);
+                increment = double_to_fp(float_increment);
+                scale = float_increment / index_inc;
+            }
+            else{
+                float_increment = constant_float_increment;
+                increment = constant_increment;
+                scale = constant_scale;
+            }
+
+			increment_t start_filter_index = double_to_fp(input_index * float_increment);
+
+			calc_output_multi_mt(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, per_thread_data_out + filter->out_gen);
 			filter->out_gen += channels;
 
 			/* Figure out the next index. */
-			input_index += 1.0 / src_ratio;
+			input_index += (is_constant_ratio) ? constant_input_index_inc : 1.0 / src_ratio;
 			rem = fmod_one(input_index);
 
 			filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)) % filter->b_len;

From 797a807267ce536400fd0fdd42457606c030d445 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Sun, 17 Sep 2023 20:41:01 +0900
Subject: [PATCH 05/18] correct indentation

---
 src/src_sinc.c | 76 ++++++++++++++++++++++++++++----------------------
 1 file changed, 42 insertions(+), 34 deletions(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index df0e6bc6..4c546e64 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -441,11 +441,11 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do
 	const long out_count = filter->out_count;
 	const int index_inc = filter->index_inc;
 
-    const int is_constant_ratio = (state->last_ratio == data->src_ratio) ? 1 : 0;
-    const double constant_input_index_inc = 1.0 / src_ratio;
-    const double constant_float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0);
-    const increment_t constant_increment = double_to_fp(constant_float_increment);
-    const double constant_scale = constant_float_increment / index_inc;
+	const int is_constant_ratio = (state->last_ratio == data->src_ratio) ? 1 : 0;
+	const double constant_input_index_inc = 1.0 / src_ratio;
+	const double constant_float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0);
+	const increment_t constant_increment = double_to_fp(constant_float_increment);
+	const double constant_scale = constant_float_increment / index_inc;
 
 	/* Main processing loop. */
 	while (filter->out_gen < out_count)
@@ -470,21 +470,23 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do
 					break;
 			};
 
-            double scale, float_increment;
-            increment_t increment;
-            if ( !is_constant_ratio ){
-                if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10)
-                    src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count;
-
-                float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0);
-                increment = double_to_fp(float_increment);
-                scale = float_increment / index_inc;
-            }
-            else{
-                float_increment = constant_float_increment;
-                increment = constant_increment;
-                scale = constant_scale;
-            }
+			double scale, float_increment;
+			increment_t increment;
+			if (!is_constant_ratio)
+			{
+				if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10)
+					src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count;
+
+				float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0);
+				increment = double_to_fp(float_increment);
+				scale = float_increment / index_inc;
+			}
+			else
+			{
+				float_increment = constant_float_increment;
+				increment = constant_increment;
+				scale = constant_scale;
+			}
 
 			increment_t start_filter_index = double_to_fp(input_index * float_increment);
 
@@ -530,7 +532,7 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	SRC_DATA *per_thread_data = (SRC_DATA *)malloc(num_of_threads * sizeof(SRC_DATA));
 	SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(num_of_threads * sizeof(SINC_FILTER));
 	SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(num_of_threads * sizeof(SRC_ERROR));
-	
+
 	float **per_thread_buffer = (float **)calloc(num_of_threads, sizeof(float *));
 	double **per_thread_data_out = (double **)calloc(num_of_threads, sizeof(double *));
 
@@ -542,10 +544,11 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 			goto cleanup_and_return;
 	}
 
-	if (num_of_threads == 1)		// w/o OpenMP
+	if (num_of_threads == 1) // w/o OpenMP
 	{
 			per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double));
-			if ( !per_thread_data_out[0] ) goto cleanup_and_return;
+			if (!per_thread_data_out[0])
+				goto cleanup_and_return;
 
 			per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, per_thread_data_out[0], state, data);
 
@@ -568,7 +571,8 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 			per_thread_buffer[child_no] = (float *)malloc(filter_buffer_len * sizeof(float));
 			per_thread_data_out[child_no] = (double *)malloc(out_count * sizeof(double));
 
-			if ( !per_thread_buffer[child_no] || !per_thread_data_out[child_no] ){
+			if (!per_thread_buffer[child_no] || !per_thread_data_out[child_no])
+			{
 
 				per_thread_retval[child_no] = SRC_ERR_MALLOC_FAILED;
 
@@ -585,16 +589,18 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 			per_thread_filter[child_no].buffer = per_thread_buffer[child_no];
 
 			per_thread_retval[child_no] = _sinc_multichan_vari_process_mt(
-                num_of_threads, child_no, per_thread_data_out[child_no],
+				num_of_threads, child_no, per_thread_data_out[child_no],
 				&per_thread_state[child_no], &per_thread_data[child_no]);
 	}
 
 	// error checking for each worker
-	for (int child_no = 0; child_no < num_of_threads; child_no++){
-		if ( per_thread_retval[child_no] != SRC_ERR_NO_ERROR ){
-			retval = per_thread_retval[child_no];
-			goto cleanup_and_return;
-		}
+	for (int child_no = 0; child_no < num_of_threads; child_no++)
+	{
+			if (per_thread_retval[child_no] != SRC_ERR_NO_ERROR)
+			{
+				retval = per_thread_retval[child_no];
+				goto cleanup_and_return;
+			}
 	}
 
 	// update filter status
@@ -610,7 +616,7 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	memcpy(data, &per_thread_data[0], sizeof(SRC_DATA));
 
 #pragma omp parallel for
-	for (int count = 0; count < filter->out_gen; count++)   // sum up every worker's result
+	for (int count = 0; count < filter->out_gen; count++) // sum up every worker's result
 	{
 			double sum = 0.0;
 			for (int child_no = 0; child_no < num_of_threads; child_no++)
@@ -632,7 +638,7 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 
 	if (per_thread_filter)
 			free(per_thread_filter);
-	
+
 	if (per_thread_retval)
 			free(per_thread_retval);
 
@@ -647,8 +653,10 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 			free(per_thread_buffer);
 	}
 
-	if (per_thread_data_out) {
-			for (int child_no = 0; child_no < num_of_threads; child_no++) {
+	if (per_thread_data_out)
+	{
+			for (int child_no = 0; child_no < num_of_threads; child_no++)
+			{
 				if (per_thread_data_out[child_no])
 					free(per_thread_data_out[child_no]);
 			}

From 9c9959dbbea24b36bf786dda39970272511f29f9 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Mon, 13 Nov 2023 06:38:33 +0900
Subject: [PATCH 06/18] - Windows (MSVC) support - eliminate redundant buffer

---
 CMakeLists.txt                    |  10 +-
 src/CMakeLists.txt                |  15 ++-
 src/src_sinc.c                    | 186 ++++++++++++++++++------------
 tests/multichan_throughput_test.c |   6 +-
 tests/throughput_test.c           |   6 +-
 5 files changed, 134 insertions(+), 89 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4a8f2b3f..ef2fb6ad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,10 @@
 cmake_minimum_required(VERSION 3.1..3.18)
 
+if (BUILD_MULTI_THREADING)
+  set(MULTI_THREADING 1)
+  add_definitions(-DMULTI_THREADING)
+endif()
+
 # Policies
 
 # Include file check macros honor CMAKE_REQUIRED_LIBRARIES, CMake >= 3.12
@@ -72,11 +77,6 @@ endif()
 if(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
   option(LIBSAMPLERATE_ENABLE_SANITIZERS "Enable ASAN and UBSAN" OFF)
 
-  if (BUILD_MULTI_THREADING)
-    set(MULTI_THREADING 1)
-    add_definitions(-DMULTI_THREADING)
-  endif()
-
   if(LIBSAMPLERATE_ENABLE_SANITIZERS)
     # Use ASAN and UBSAN, make it fail on any error, improve stack traces
     set(sanitizer_flags -fsanitize=address,undefined -fno-sanitize-recover=all -fno-omit-frame-pointer)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6500caf5..21ce661c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -43,6 +43,16 @@ add_library(samplerate
 # ALIAS to use if libsamplerate is included from other project with add_subdirectory()
 add_library(SampleRate::samplerate ALIAS samplerate)
 
+if(MULTI_THREADING)
+  if(WIN32)
+    target_link_libraries(samplerate PRIVATE libomp)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /openmp:llvm")
+  else()
+    target_link_libraries(samplerate PRIVATE omp)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fopenmp")
+  endif()
+endif()
+
 # CMake generates wrong DLL names for MinGW and Cygwin, fix it
 if(BUILD_SHARED_LIBS AND WIN32)
   if(LIBSAMPLERATE_COMPATIBLE_NAME)
@@ -72,11 +82,6 @@ if(LIBM_REQUIRED)
     target_link_libraries(samplerate PRIVATE m)
 endif()
 
-if(MULTI_THREADING)
-  target_link_libraries(samplerate PRIVATE omp)
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS_RELEASE} -fopenmp")
-endif()
-
 # Set public header
 set_property(TARGET samplerate PROPERTY PUBLIC_HEADER ${PROJECT_SOURCE_DIR}/include/samplerate.h)
 
diff --git a/src/src_sinc.c b/src/src_sinc.c
index 4c546e64..ce25b3e9 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -218,13 +218,20 @@ sinc_get_description (int src_enum)
 #ifdef MULTI_THREADING
 
 #include <omp.h>
-#include <unistd.h>
+
+#ifdef _WIN32
+	#define ALWAYS_INLINE __forceinline
+	#include <windows.h>
+#else
+	#define ALWAYS_INLINE __attribute__((always_inline)) static
+	#include <unistd.h>
+#endif
 
 /* smaller frames are processed in single thread to avoid overheads */
 #define MULTI_THREADING_THRESHOLD (256)
 
-__attribute__((always_inline)) static void
-calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, const int child_no, 
+ALWAYS_INLINE void
+calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, const int th_no, 
 	const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
 {
 	assert(num_of_threads == 1 || num_of_threads % 2 == 0);
@@ -235,8 +242,8 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con
 	const increment_t max_filter_index = int_to_fp(filter->coeff_half_len);
 
 	const int mt_increment_factor = (num_of_threads > 1) ? num_of_threads / 2 : 1;
-	const int mt_left_right_sw = child_no % 2;
-	const int mt_shift = child_no / 2;
+	const int mt_left_right_sw = th_no % 2;
+	const int mt_shift = th_no / 2;
 
 	if (!mt_left_right_sw || num_of_threads == 1)
 	{
@@ -320,10 +327,10 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con
 
 	for (int ch = 0; ch < channels; ch++)
 			output[ch] = (scale * (left[ch] + right[ch])); // double
-} /* calc_output_stereo */
+} /* calc_output_multi_mt_core */
 
-__attribute__((always_inline)) static void
-calc_output_multi_mt_3(const int num_of_threads, const int child_no,
+ALWAYS_INLINE void
+calc_output_multi_mt_3(const int num_of_threads, const int th_no,
 	const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
 {
 
@@ -331,20 +338,20 @@ calc_output_multi_mt_3(const int num_of_threads, const int child_no,
 
 	if (skip_fraction)
 	{
-			calc_output_multi_mt_core(1, num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_core(1, num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output);
 	}
 	else
 	{
-			calc_output_multi_mt_core(0, num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_core(0, num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output);
 	}
 }
 
-__attribute__((always_inline)) static void
-calc_output_multi_mt_2(const int num_of_threads, const int child_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
+ALWAYS_INLINE void
+calc_output_multi_mt_2(const int num_of_threads, const int th_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
 {
 #define OPTIMIZE_LINE(x)                                                                                               \
 	case (x):                                                                                                          \
-			calc_output_multi_mt_3(num_of_threads, child_no, filter, increment, start_filter_index, x, scale, output); \
+			calc_output_multi_mt_3(num_of_threads, th_no, filter, increment, start_filter_index, x, scale, output); \
 			break;
 
 	switch (channels) // to kick the compile-time optimizer, channel numbers up to 16 are extracted as constants here.
@@ -366,47 +373,43 @@ calc_output_multi_mt_2(const int num_of_threads, const int child_no, const SINC_
 			OPTIMIZE_LINE(15);
 			OPTIMIZE_LINE(16);
 	default:
-			calc_output_multi_mt_3(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_3(num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output);
 			break;
 	}
 #undef OPTIMIZE_LINE
 }
 
-__attribute__((always_inline)) static void
-calc_output_multi_mt(const int num_of_threads, const int child_no, 
+ALWAYS_INLINE void
+calc_output_multi_mt(const int num_of_threads, const int th_no, 
 	const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
 {
 #define OPTIMIZE_LINE(x)                                                                                         \
 	case (x):                                                                                                    \
-			calc_output_multi_mt_2(x, child_no, filter, increment, start_filter_index, channels, scale, output); \
+			calc_output_multi_mt_2(x, th_no, filter, increment, start_filter_index, channels, scale, output); \
 			break;
 
 	switch (num_of_threads) // to kick the compile-time optimizer, the number of threads is extracted as constant here.
 	{
 			OPTIMIZE_LINE(1);
-			OPTIMIZE_LINE(2);
-			OPTIMIZE_LINE(4);
 			OPTIMIZE_LINE(6);
-			OPTIMIZE_LINE(8);
 			OPTIMIZE_LINE(10);
-			OPTIMIZE_LINE(12);
 			OPTIMIZE_LINE(14);
-			OPTIMIZE_LINE(16);
 	default:
-			calc_output_multi_mt_2(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_2(num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output);
 			break;
 	}
 #undef OPTIMIZE_LINE
 }
 
 static SRC_ERROR
-_sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, double *const per_thread_data_out,
-	SRC_STATE *const state, SRC_DATA *const data)
+_sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, const int interleave, double *const per_thread_data_out,
+	SRC_STATE *const state, SRC_DATA *const data, SRC_STATE *const main_state )
 {
 	if (state->private_data == NULL)
 			return SRC_ERR_NO_PRIVATE;
 
 	SINC_FILTER *filter = (SINC_FILTER *)state->private_data;
+	SINC_FILTER *main_filter = (SINC_FILTER *)main_state->private_data;
 
 	/* If there is not a problem, this will be optimised out. */
 	if (sizeof(filter->buffer[0]) != sizeof(data->data_in[0]))
@@ -428,7 +431,7 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do
 			count /= MIN(state->last_ratio, data->src_ratio);
 
 	/* Maximum coefficientson either side of center point. */
-	const int half_filter_chan_len = state->channels * (int)(psf_lrint(count) + 1);
+	const int half_filter_chan_len = channels * (int)(psf_lrint(count) + 1);
 
 	double input_index = state->last_position;
 
@@ -448,6 +451,8 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do
 	const double constant_scale = constant_float_increment / index_inc;
 
 	/* Main processing loop. */
+	int interleave_counter = 0;
+	const int interleave_mask = interleave - 1;
 	while (filter->out_gen < out_count)
 	{
 			/* Need to reload buffer? */
@@ -455,7 +460,24 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do
 
 			if (samples_in_hand <= half_filter_chan_len)
 			{
-				if ((state->error = prepare_data(filter, channels, data, half_filter_chan_len)) != 0)
+				// only one buffer is used (shared by all threads)
+				{
+					#pragma omp barrier
+					#pragma omp single
+					{
+						state->error = prepare_data(filter, channels, data, half_filter_chan_len);
+						
+						*main_state = *state;
+						*main_filter = *filter;
+					}
+					#pragma omp barrier					
+					{
+						*state = *main_state;
+						*filter = *main_filter;
+					}
+				}
+
+				if (state->error != 0)
 					return state->error;
 
 				samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len;
@@ -490,7 +512,10 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do
 
 			increment_t start_filter_index = double_to_fp(input_index * float_increment);
 
-			calc_output_multi_mt(num_of_threads, child_no, filter, increment, start_filter_index, channels, scale, per_thread_data_out + filter->out_gen);
+			if ( (interleave_counter & interleave_mask) == (child_no & interleave_mask) ){
+				calc_output_multi_mt(NUM_OF_THREADS/interleave, child_no/interleave, filter, increment, start_filter_index, channels, scale, per_thread_data_out + filter->out_gen/channels/interleave * channels);
+			}
+			interleave_counter = (interleave_counter+1) & interleave_mask;
 			filter->out_gen += channels;
 
 			/* Figure out the next index. */
@@ -506,8 +531,8 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no, do
 	/* Save current ratio rather then target ratio. */
 	state->last_ratio = src_ratio;
 
-	data->input_frames_used = filter->in_used / state->channels;
-	data->output_frames_gen = filter->out_gen / state->channels;
+	data->input_frames_used = filter->in_used / channels;
+	data->output_frames_gen = filter->out_gen / channels;
 
 	return SRC_ERR_NO_ERROR;
 }
@@ -518,43 +543,65 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	if (state->private_data == NULL)
 			return SRC_ERR_NO_PRIVATE;
 
-	const long in_count = data->input_frames * state->channels;
-	const long out_count = data->output_frames * state->channels;
+	const int channels = state->channels;
+
+	const long in_count = data->input_frames * channels;
+	const long out_count = data->output_frames * channels;
 
 	SINC_FILTER *filter = (SINC_FILTER *)state->private_data;
-	const int filter_buffer_len = (filter->b_len + state->channels);
+	const int filter_buffer_len = (filter->b_len + channels);
 
-	const int should_be_single_thread = (sysconf(_SC_NPROCESSORS_ONLN) < 2 || in_count < MULTI_THREADING_THRESHOLD);
+#ifdef _WIN32
+	SYSTEM_INFO sysinfo;
+	GetSystemInfo(&sysinfo);
+	const int N_OF_CORES = sysinfo.dwNumberOfProcessors;
+#else
+	const int N_OF_CORES = sysconf(_SC_NPROCESSORS_ONLN);
+#endif
 
-	const int num_of_threads = should_be_single_thread ? 1 : (sysconf(_SC_NPROCESSORS_ONLN) / 2 * 2);
+	const int should_be_single_thread = (N_OF_CORES < 2 || in_count < MULTI_THREADING_THRESHOLD);
+	const int NUM_OF_THREADS = should_be_single_thread ? 1 : (N_OF_CORES / 2 * 2);
 
-	SRC_STATE *per_thread_state = (SRC_STATE *)malloc(num_of_threads * sizeof(SRC_STATE));
-	SRC_DATA *per_thread_data = (SRC_DATA *)malloc(num_of_threads * sizeof(SRC_DATA));
-	SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(num_of_threads * sizeof(SINC_FILTER));
-	SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(num_of_threads * sizeof(SRC_ERROR));
+	int _interleave = 1;
+	while (	((NUM_OF_THREADS / _interleave / 2) % 2 == 0 || (NUM_OF_THREADS / _interleave / 2) == 1) 
+		&& _interleave * 2 <= NUM_OF_THREADS)
+	{
+			_interleave *= 2;
+	}
+	
+	const int interleave = _interleave;
+
+	assert( NUM_OF_THREADS % interleave == 0 );
+	assert( (NUM_OF_THREADS / interleave) % 2 == 0 );
+
+	const int interleave_mask = interleave - 1;
+
+	SRC_STATE *per_thread_state = (SRC_STATE *)malloc(NUM_OF_THREADS * sizeof(SRC_STATE));
+	SRC_DATA *per_thread_data = (SRC_DATA *)malloc(NUM_OF_THREADS * sizeof(SRC_DATA));
+	SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(NUM_OF_THREADS * sizeof(SINC_FILTER));
+	SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(NUM_OF_THREADS * sizeof(SRC_ERROR));
 
-	float **per_thread_buffer = (float **)calloc(num_of_threads, sizeof(float *));
-	double **per_thread_data_out = (double **)calloc(num_of_threads, sizeof(double *));
+	double **per_thread_data_out = (double **)calloc(NUM_OF_THREADS, sizeof(double *));
 
 	SRC_ERROR retval = SRC_ERR_MALLOC_FAILED;
 
 	if ( !per_thread_state || !per_thread_data || !per_thread_filter
-		 || !per_thread_buffer || !per_thread_data_out || !per_thread_retval )
+		 || !per_thread_data_out || !per_thread_retval )
 	{
 			goto cleanup_and_return;
 	}
 
-	if (num_of_threads == 1) // w/o OpenMP
+	if (NUM_OF_THREADS == 1) // w/o OpenMP
 	{
 			per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double));
 			if (!per_thread_data_out[0])
 				goto cleanup_and_return;
 
-			per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, per_thread_data_out[0], state, data);
+			per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, 1, per_thread_data_out[0], state, data, state);
 
 			for (int count = 0; count < filter->out_gen; count++)
 			{
-				data->data_out[count] = per_thread_data_out[0][count];
+				data->data_out[count] = (float)per_thread_data_out[0][count];
 			}
 
 			retval = per_thread_retval[0];
@@ -563,15 +610,17 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	}
 
 	// OpenMP
-	omp_set_num_threads(num_of_threads);
+	omp_set_num_threads(NUM_OF_THREADS);
+
+	int omp_child_no;
 
 #pragma omp parallel for
-	for (int child_no = 0; child_no < num_of_threads; child_no++)
+	for (omp_child_no = 0; omp_child_no < NUM_OF_THREADS; omp_child_no++)
 	{
-			per_thread_buffer[child_no] = (float *)malloc(filter_buffer_len * sizeof(float));
-			per_thread_data_out[child_no] = (double *)malloc(out_count * sizeof(double));
+			const int child_no = omp_child_no;
+			per_thread_data_out[child_no] = (double *)malloc((out_count/interleave + channels) * sizeof(double));
 
-			if (!per_thread_buffer[child_no] || !per_thread_data_out[child_no])
+			if ( !per_thread_data_out[child_no])
 			{
 
 				per_thread_retval[child_no] = SRC_ERR_MALLOC_FAILED;
@@ -585,16 +634,15 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 			memcpy(&per_thread_state[child_no], state, sizeof(SRC_STATE));
 			per_thread_state[child_no].private_data = &per_thread_filter[child_no];
 
-			memcpy(per_thread_buffer[child_no], filter->buffer, filter_buffer_len * sizeof(float));
-			per_thread_filter[child_no].buffer = per_thread_buffer[child_no];
+			per_thread_filter[child_no].buffer = filter->buffer;
 
 			per_thread_retval[child_no] = _sinc_multichan_vari_process_mt(
-				num_of_threads, child_no, per_thread_data_out[child_no],
-				&per_thread_state[child_no], &per_thread_data[child_no]);
+				NUM_OF_THREADS, child_no, interleave, per_thread_data_out[child_no],
+				&per_thread_state[child_no], &per_thread_data[child_no], state);
 	}
 
 	// error checking for each worker
-	for (int child_no = 0; child_no < num_of_threads; child_no++)
+	for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++)
 	{
 			if (per_thread_retval[child_no] != SRC_ERR_NO_ERROR)
 			{
@@ -604,24 +652,27 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	}
 
 	// update filter status
-	memcpy(filter->buffer, per_thread_buffer[0], filter_buffer_len * sizeof(float));
-
 	float *buf = filter->buffer;
 	memcpy(filter, &per_thread_filter[0], sizeof(SINC_FILTER));
 	filter->buffer = buf;
 
 	memcpy(state, &per_thread_state[0], sizeof(SRC_STATE));
 	state->private_data = filter;
-
+	
 	memcpy(data, &per_thread_data[0], sizeof(SRC_DATA));
 
+	int omp_count;
+
 #pragma omp parallel for
-	for (int count = 0; count < filter->out_gen; count++) // sum up every worker's result
+	for (omp_count = 0; omp_count < filter->out_gen; omp_count++) // sum up every worker's result
 	{
+			const int count = omp_count;
+			const int interleave_counter = count/channels;
 			double sum = 0.0;
-			for (int child_no = 0; child_no < num_of_threads; child_no++)
+			for (int c_no = 0; c_no < NUM_OF_THREADS/interleave; c_no++)
 			{
-				sum += per_thread_data_out[child_no][count];
+				const int child_no = (c_no * interleave) + (interleave_counter & interleave_mask);
+				sum += per_thread_data_out[child_no][interleave_counter/interleave * channels];
 			}
 			data->data_out[count] = (float)sum;
 	}
@@ -642,20 +693,9 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	if (per_thread_retval)
 			free(per_thread_retval);
 
-	if (per_thread_buffer)
-	{
-			for (int child_no = 0; child_no < num_of_threads; child_no++)
-			{
-				if (per_thread_buffer[child_no])
-					free(per_thread_buffer[child_no]);
-			}
-
-			free(per_thread_buffer);
-	}
-
 	if (per_thread_data_out)
 	{
-			for (int child_no = 0; child_no < num_of_threads; child_no++)
+			for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++)
 			{
 				if (per_thread_data_out[child_no])
 					free(per_thread_data_out[child_no]);
diff --git a/tests/multichan_throughput_test.c b/tests/multichan_throughput_test.c
index 9cf5639b..7c88852b 100644
--- a/tests/multichan_throughput_test.c
+++ b/tests/multichan_throughput_test.c
@@ -41,7 +41,7 @@ static float output [BUFFER_LEN] ;
 static void
 throughput_test (int converter, int channels, long *best_throughput)
 {	SRC_DATA src_data ;
-#ifdef MULTI_THREADING
+#if !defined(_WIN32) && defined(MULTI_THREADING)
 	struct timespec start_gettime, finish_gettime;
 #else
 	clock_t start_time, clock_time ;
@@ -67,7 +67,7 @@ throughput_test (int converter, int channels, long *best_throughput)
 	sleep (2) ;
 #endif
 
-#ifdef MULTI_THREADING
+#if !defined(_WIN32) && defined(MULTI_THREADING)
 	clock_gettime(CLOCK_MONOTONIC, &start_gettime);
 #else
 	start_time = clock () ;
@@ -82,7 +82,7 @@ throughput_test (int converter, int channels, long *best_throughput)
 
 		total_frames += src_data.output_frames_gen ;
 
-#ifdef MULTI_THREADING
+#if !defined(_WIN32) && defined(MULTI_THREADING)
 		clock_gettime(CLOCK_MONOTONIC, &finish_gettime);
 
 		duration = (finish_gettime.tv_sec - start_gettime.tv_sec);
diff --git a/tests/throughput_test.c b/tests/throughput_test.c
index 0a6aa834..f4181897 100644
--- a/tests/throughput_test.c
+++ b/tests/throughput_test.c
@@ -38,7 +38,7 @@ static float output [BUFFER_LEN] ;
 static long
 throughput_test (int converter, long best_throughput)
 {	SRC_DATA src_data ;
-#ifdef MULTI_THREADING
+#if !defined(_WIN32) && defined(MULTI_THREADING)
 	struct timespec start_gettime, finish_gettime;
 #else
 	clock_t start_time, clock_time ;
@@ -64,7 +64,7 @@ throughput_test (int converter, long best_throughput)
 	sleep (2) ;
 #endif
 
-#ifdef MULTI_THREADING
+#if !defined(_WIN32) && defined(MULTI_THREADING)
 	clock_gettime(CLOCK_MONOTONIC, &start_gettime);
 #else
 	start_time = clock () ;
@@ -79,7 +79,7 @@ throughput_test (int converter, long best_throughput)
 
 		total_frames += src_data.output_frames_gen ;
 
-#ifdef MULTI_THREADING
+#if !defined(_WIN32) && defined(MULTI_THREADING)
 		clock_gettime(CLOCK_MONOTONIC, &finish_gettime);
 
 		duration = (finish_gettime.tv_sec - start_gettime.tv_sec);

From 7c3ec94f0d64b45aaa1014781ab27208aea9b7a8 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Tue, 14 Nov 2023 20:21:15 +0900
Subject: [PATCH 07/18] ensure the number of threads actually used by omp to
 have the barrier logic work correctly.

---
 src/src_sinc.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index ce25b3e9..5825191d 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -221,10 +221,10 @@ sinc_get_description (int src_enum)
 
 #ifdef _WIN32
 	#define ALWAYS_INLINE __forceinline
-	#include <windows.h>
+	//#include <windows.h>
 #else
 	#define ALWAYS_INLINE __attribute__((always_inline)) static
-	#include <unistd.h>
+	//#include <unistd.h>
 #endif
 
 /* smaller frames are processed in single thread to avoid overheads */
@@ -551,13 +551,7 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	SINC_FILTER *filter = (SINC_FILTER *)state->private_data;
 	const int filter_buffer_len = (filter->b_len + channels);
 
-#ifdef _WIN32
-	SYSTEM_INFO sysinfo;
-	GetSystemInfo(&sysinfo);
-	const int N_OF_CORES = sysinfo.dwNumberOfProcessors;
-#else
-	const int N_OF_CORES = sysconf(_SC_NPROCESSORS_ONLN);
-#endif
+	const int N_OF_CORES = omp_get_num_procs();
 
 	const int should_be_single_thread = (N_OF_CORES < 2 || in_count < MULTI_THREADING_THRESHOLD);
 	const int NUM_OF_THREADS = should_be_single_thread ? 1 : (N_OF_CORES / 2 * 2);
@@ -591,6 +585,12 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 			goto cleanup_and_return;
 	}
 
+	// OpenMP
+	omp_set_dynamic(0);
+	omp_set_num_threads(NUM_OF_THREADS);
+
+	assert( NUM_OF_THREADS == omp_get_num_threads() );
+
 	if (NUM_OF_THREADS == 1) // w/o OpenMP
 	{
 			per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double));
@@ -609,9 +609,6 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 			goto cleanup_and_return;
 	}
 
-	// OpenMP
-	omp_set_num_threads(NUM_OF_THREADS);
-
 	int omp_child_no;
 
 #pragma omp parallel for

From 58c7d5f4aae812d26e9aeb7de8eb89515838b016 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Tue, 14 Nov 2023 21:02:19 +0900
Subject: [PATCH 08/18] change threading strategy because overhead of
 allocating extra work buffer for each thread seems to be larger in fewer
 channel process cases.

---
 src/src_sinc.c | 171 +++++++++++--------------------------------------
 1 file changed, 38 insertions(+), 133 deletions(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index 5825191d..cfbbeb5a 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -231,21 +231,15 @@ sinc_get_description (int src_enum)
 #define MULTI_THREADING_THRESHOLD (256)
 
 ALWAYS_INLINE void
-calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, const int th_no, 
-	const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
+calc_output_multi_mt_core(const int skip_fraction, const SINC_FILTER * const filter, 
+	const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float * const output)
 {
-	assert(num_of_threads == 1 || num_of_threads % 2 == 0);
 	double left[MAX_CHANNELS] = {0};
 	double right[MAX_CHANNELS] = {0};
 
 	/* Convert input parameters into fixed point. */
 	const increment_t max_filter_index = int_to_fp(filter->coeff_half_len);
 
-	const int mt_increment_factor = (num_of_threads > 1) ? num_of_threads / 2 : 1;
-	const int mt_left_right_sw = th_no % 2;
-	const int mt_shift = th_no / 2;
-
-	if (!mt_left_right_sw || num_of_threads == 1)
 	{
 			/* First apply the left half of the filter. */
 			increment_t filter_index1 = start_filter_index;
@@ -262,9 +256,6 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con
 				data_index1 += steps * channels;
 			}
 
-			filter_index1 -= increment * mt_shift;
-			data_index1 = data_index1 + channels * mt_shift;
-
 			// left = 0.0;
 			while (filter_index1 >= MAKE_INCREMENT_T(0))
 			{
@@ -277,12 +268,11 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con
 				for (int ch = 0; ch < channels; ch++)
 					left[ch] += icoeff * filter->buffer[data_index1 + ch];
 
-				filter_index1 -= increment * mt_increment_factor;
-				data_index1 = data_index1 + channels * mt_increment_factor;
+				filter_index1 -= increment;
+				data_index1 = data_index1 + channels;
 			};
 	}
 
-	if (mt_left_right_sw || num_of_threads == 1)
 	{
 			/* Now apply the right half of the filter. */
 			increment_t filter_index2 = increment - start_filter_index;
@@ -291,7 +281,6 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con
 			int data_index2 = filter->b_current + channels * (1 + coeff_count2);
 			// right = 0.0;
 
-			if (!mt_shift)
 			{
 				const double fraction = fp_to_double(filter_index2);
 				const int indx = fp_to_int(filter_index2);
@@ -306,9 +295,6 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con
 			filter_index2 -= increment;
 			data_index2 = data_index2 - channels;
 
-			filter_index2 -= increment * mt_shift;
-			data_index2 = data_index2 - channels * mt_shift;
-
 			while (filter_index2 > MAKE_INCREMENT_T(0))
 			{
 				const double fraction = fp_to_double(filter_index2);
@@ -320,38 +306,38 @@ calc_output_multi_mt_core(const int skip_fraction, const int num_of_threads, con
 				for (int ch = 0; ch < channels; ch++)
 					right[ch] += icoeff * filter->buffer[data_index2 + ch];
 
-				filter_index2 -= increment * mt_increment_factor;
-				data_index2 = data_index2 - channels * mt_increment_factor;
+				filter_index2 -= increment;
+				data_index2 = data_index2 - channels;
 			}
 	}
 
 	for (int ch = 0; ch < channels; ch++)
-			output[ch] = (scale * (left[ch] + right[ch])); // double
+			output[ch] = (float)(scale * (left[ch] + right[ch]));
 } /* calc_output_multi_mt_core */
 
 ALWAYS_INLINE void
-calc_output_multi_mt_3(const int num_of_threads, const int th_no,
-	const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
+calc_output_multi_mt_2(
+	const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
 {
 
 	const int skip_fraction = increment == ((increment >> SHIFT_BITS) << SHIFT_BITS) && start_filter_index == ((start_filter_index >> SHIFT_BITS) << SHIFT_BITS) ? 1 : 0;
 
 	if (skip_fraction)
 	{
-			calc_output_multi_mt_core(1, num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_core(1, filter, increment, start_filter_index, channels, scale, output);
 	}
 	else
 	{
-			calc_output_multi_mt_core(0, num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_core(0, filter, increment, start_filter_index, channels, scale, output);
 	}
 }
 
 ALWAYS_INLINE void
-calc_output_multi_mt_2(const int num_of_threads, const int th_no, const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
+calc_output_multi_mt(const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
 {
 #define OPTIMIZE_LINE(x)                                                                                               \
 	case (x):                                                                                                          \
-			calc_output_multi_mt_3(num_of_threads, th_no, filter, increment, start_filter_index, x, scale, output); \
+			calc_output_multi_mt_2(filter, increment, start_filter_index, x, scale, output); \
 			break;
 
 	switch (channels) // to kick the compile-time optimizer, channel numbers up to 16 are extracted as constants here.
@@ -373,37 +359,15 @@ calc_output_multi_mt_2(const int num_of_threads, const int th_no, const SINC_FIL
 			OPTIMIZE_LINE(15);
 			OPTIMIZE_LINE(16);
 	default:
-			calc_output_multi_mt_3(num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output);
-			break;
-	}
-#undef OPTIMIZE_LINE
-}
-
-ALWAYS_INLINE void
-calc_output_multi_mt(const int num_of_threads, const int th_no, 
-	const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, double *const output)
-{
-#define OPTIMIZE_LINE(x)                                                                                         \
-	case (x):                                                                                                    \
-			calc_output_multi_mt_2(x, th_no, filter, increment, start_filter_index, channels, scale, output); \
-			break;
-
-	switch (num_of_threads) // to kick the compile-time optimizer, the number of threads is extracted as constant here.
-	{
-			OPTIMIZE_LINE(1);
-			OPTIMIZE_LINE(6);
-			OPTIMIZE_LINE(10);
-			OPTIMIZE_LINE(14);
-	default:
-			calc_output_multi_mt_2(num_of_threads, th_no, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_2(filter, increment, start_filter_index, channels, scale, output);
 			break;
 	}
 #undef OPTIMIZE_LINE
 }
 
 static SRC_ERROR
-_sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, const int interleave, double *const per_thread_data_out,
-	SRC_STATE *const state, SRC_DATA *const data, SRC_STATE *const main_state )
+_sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no,
+	SRC_STATE * const state, SRC_DATA * const data, SRC_STATE *const main_state )
 {
 	if (state->private_data == NULL)
 			return SRC_ERR_NO_PRIVATE;
@@ -452,11 +416,12 @@ _sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, co
 
 	/* Main processing loop. */
 	int interleave_counter = 0;
-	const int interleave_mask = interleave - 1;
+	float * const data_out = data->data_out;
+	
 	while (filter->out_gen < out_count)
 	{
 			/* Need to reload buffer? */
-			int samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len;
+			int samples_in_hand = ( filter->b_end < filter->b_current ) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current);
 
 			if (samples_in_hand <= half_filter_chan_len)
 			{
@@ -480,7 +445,7 @@ _sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, co
 				if (state->error != 0)
 					return state->error;
 
-				samples_in_hand = (filter->b_end - filter->b_current + filter->b_len) % filter->b_len;
+				samples_in_hand = ( filter->b_end < filter->b_current ) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current);
 				if (samples_in_hand <= half_filter_chan_len)
 					break;
 			};
@@ -512,17 +477,18 @@ _sinc_multichan_vari_process_mt(const int NUM_OF_THREADS, const int child_no, co
 
 			increment_t start_filter_index = double_to_fp(input_index * float_increment);
 
-			if ( (interleave_counter & interleave_mask) == (child_no & interleave_mask) ){
-				calc_output_multi_mt(NUM_OF_THREADS/interleave, child_no/interleave, filter, increment, start_filter_index, channels, scale, per_thread_data_out + filter->out_gen/channels/interleave * channels);
+			if ( child_no == interleave_counter ){
+				calc_output_multi_mt(filter, increment, start_filter_index, channels, scale, data_out + filter->out_gen);
 			}
-			interleave_counter = (interleave_counter+1) & interleave_mask;
+			if ( ++interleave_counter == num_of_threads ) interleave_counter = 0;
 			filter->out_gen += channels;
 
 			/* Figure out the next index. */
 			input_index += (is_constant_ratio) ? constant_input_index_inc : 1.0 / src_ratio;
 			rem = fmod_one(input_index);
 
-			filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem)) % filter->b_len;
+			filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem));
+			if ( filter->b_current >= filter->b_len ) filter->b_current -= filter->b_len;
 			input_index = rem;
 	};
 
@@ -554,55 +520,30 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	const int N_OF_CORES = omp_get_num_procs();
 
 	const int should_be_single_thread = (N_OF_CORES < 2 || in_count < MULTI_THREADING_THRESHOLD);
-	const int NUM_OF_THREADS = should_be_single_thread ? 1 : (N_OF_CORES / 2 * 2);
-
-	int _interleave = 1;
-	while (	((NUM_OF_THREADS / _interleave / 2) % 2 == 0 || (NUM_OF_THREADS / _interleave / 2) == 1) 
-		&& _interleave * 2 <= NUM_OF_THREADS)
-	{
-			_interleave *= 2;
-	}
-	
-	const int interleave = _interleave;
-
-	assert( NUM_OF_THREADS % interleave == 0 );
-	assert( (NUM_OF_THREADS / interleave) % 2 == 0 );
-
-	const int interleave_mask = interleave - 1;
+	const int num_of_threads = should_be_single_thread ? 1 : (N_OF_CORES / 2 * 2);
 
-	SRC_STATE *per_thread_state = (SRC_STATE *)malloc(NUM_OF_THREADS * sizeof(SRC_STATE));
-	SRC_DATA *per_thread_data = (SRC_DATA *)malloc(NUM_OF_THREADS * sizeof(SRC_DATA));
-	SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(NUM_OF_THREADS * sizeof(SINC_FILTER));
-	SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(NUM_OF_THREADS * sizeof(SRC_ERROR));
-
-	double **per_thread_data_out = (double **)calloc(NUM_OF_THREADS, sizeof(double *));
+	SRC_STATE *per_thread_state = (SRC_STATE *)malloc(num_of_threads * sizeof(SRC_STATE));
+	SRC_DATA *per_thread_data = (SRC_DATA *)malloc(num_of_threads * sizeof(SRC_DATA));
+	SINC_FILTER *per_thread_filter = (SINC_FILTER *)malloc(num_of_threads * sizeof(SINC_FILTER));
+	SRC_ERROR *per_thread_retval = (SRC_ERROR *)malloc(num_of_threads * sizeof(SRC_ERROR));
 
 	SRC_ERROR retval = SRC_ERR_MALLOC_FAILED;
 
 	if ( !per_thread_state || !per_thread_data || !per_thread_filter
-		 || !per_thread_data_out || !per_thread_retval )
+		 || !per_thread_retval )
 	{
 			goto cleanup_and_return;
 	}
 
 	// OpenMP
 	omp_set_dynamic(0);
-	omp_set_num_threads(NUM_OF_THREADS);
+	omp_set_num_threads(num_of_threads);
 
-	assert( NUM_OF_THREADS == omp_get_num_threads() );
+	assert( num_of_threads == omp_get_num_threads() );
 
-	if (NUM_OF_THREADS == 1) // w/o OpenMP
+	if (num_of_threads == 1) // w/o OpenMP
 	{
-			per_thread_data_out[0] = (double *)malloc(out_count * sizeof(double));
-			if (!per_thread_data_out[0])
-				goto cleanup_and_return;
-
-			per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, 1, per_thread_data_out[0], state, data, state);
-
-			for (int count = 0; count < filter->out_gen; count++)
-			{
-				data->data_out[count] = (float)per_thread_data_out[0][count];
-			}
+			per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data, state);
 
 			retval = per_thread_retval[0];
 
@@ -612,18 +553,9 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	int omp_child_no;
 
 #pragma omp parallel for
-	for (omp_child_no = 0; omp_child_no < NUM_OF_THREADS; omp_child_no++)
+	for (omp_child_no = 0; omp_child_no < num_of_threads; omp_child_no++)
 	{
 			const int child_no = omp_child_no;
-			per_thread_data_out[child_no] = (double *)malloc((out_count/interleave + channels) * sizeof(double));
-
-			if ( !per_thread_data_out[child_no])
-			{
-
-				per_thread_retval[child_no] = SRC_ERR_MALLOC_FAILED;
-
-				continue;
-			}
 
 			memcpy(&per_thread_data[child_no], data, sizeof(SRC_DATA));
 			memcpy(&per_thread_filter[child_no], filter, sizeof(SINC_FILTER));
@@ -634,12 +566,12 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 			per_thread_filter[child_no].buffer = filter->buffer;
 
 			per_thread_retval[child_no] = _sinc_multichan_vari_process_mt(
-				NUM_OF_THREADS, child_no, interleave, per_thread_data_out[child_no],
+				num_of_threads, child_no, 
 				&per_thread_state[child_no], &per_thread_data[child_no], state);
 	}
 
 	// error checking for each worker
-	for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++)
+	for (int child_no = 0; child_no < num_of_threads; child_no++)
 	{
 			if (per_thread_retval[child_no] != SRC_ERR_NO_ERROR)
 			{
@@ -658,22 +590,6 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	
 	memcpy(data, &per_thread_data[0], sizeof(SRC_DATA));
 
-	int omp_count;
-
-#pragma omp parallel for
-	for (omp_count = 0; omp_count < filter->out_gen; omp_count++) // sum up every worker's result
-	{
-			const int count = omp_count;
-			const int interleave_counter = count/channels;
-			double sum = 0.0;
-			for (int c_no = 0; c_no < NUM_OF_THREADS/interleave; c_no++)
-			{
-				const int child_no = (c_no * interleave) + (interleave_counter & interleave_mask);
-				sum += per_thread_data_out[child_no][interleave_counter/interleave * channels];
-			}
-			data->data_out[count] = (float)sum;
-	}
-
 	retval = SRC_ERR_NO_ERROR;
 
 cleanup_and_return:
@@ -690,17 +606,6 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	if (per_thread_retval)
 			free(per_thread_retval);
 
-	if (per_thread_data_out)
-	{
-			for (int child_no = 0; child_no < NUM_OF_THREADS; child_no++)
-			{
-				if (per_thread_data_out[child_no])
-					free(per_thread_data_out[child_no]);
-			}
-
-			free(per_thread_data_out);
-	}
-
 	return retval;
 }
 

From 9bce93e7e9373e4643467f5109b05a269ea429f3 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Thu, 16 Nov 2023 18:20:14 +0900
Subject: [PATCH 09/18] allow odd numbers of threads prefetch coefft to cache
 memory

---
 src/src_sinc.c | 66 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 43 insertions(+), 23 deletions(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index cfbbeb5a..5fc8b8bf 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -221,17 +221,26 @@ sinc_get_description (int src_enum)
 
 #ifdef _WIN32
 	#define ALWAYS_INLINE __forceinline
-	//#include <windows.h>
-#else
+	#include <xmmintrin.h>
+
+	#define mem_prefetch(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+
+#elif defined(__GNUC__) || defined(clang)
 	#define ALWAYS_INLINE __attribute__((always_inline)) static
-	//#include <unistd.h>
+
+	#define mem_prefetch(ptr) __builtin_prefetch(ptr)
+	
+#else
+	#define ALWAYS_INLINE static
+
+	#define mem_prefetch(ptr) 
 #endif
 
 /* smaller frames are processed in single thread to avoid overheads */
 #define MULTI_THREADING_THRESHOLD (256)
 
 ALWAYS_INLINE void
-calc_output_multi_mt_core(const int skip_fraction, const SINC_FILTER * const filter, 
+calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, const SINC_FILTER * const filter, 
 	const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float * const output)
 {
 	double left[MAX_CHANNELS] = {0};
@@ -240,6 +249,8 @@ calc_output_multi_mt_core(const int skip_fraction, const SINC_FILTER * const fil
 	/* Convert input parameters into fixed point. */
 	const increment_t max_filter_index = int_to_fp(filter->coeff_half_len);
 
+	const int prefetch_increment = 8;
+
 	{
 			/* First apply the left half of the filter. */
 			increment_t filter_index1 = start_filter_index;
@@ -259,6 +270,12 @@ calc_output_multi_mt_core(const int skip_fraction, const SINC_FILTER * const fil
 			// left = 0.0;
 			while (filter_index1 >= MAKE_INCREMENT_T(0))
 			{
+				if ( enable_prefetch && filter_index1 - increment * prefetch_increment >= MAKE_INCREMENT_T(0) ){
+					const int indx = fp_to_int(filter_index1 - increment * prefetch_increment);
+					mem_prefetch(&filter->coeffs[indx]);
+					mem_prefetch(&filter->coeffs[indx+1]);
+				}
+
 				const double fraction = fp_to_double(filter_index1);
 				const int indx = fp_to_int(filter_index1);
 				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
@@ -281,22 +298,13 @@ calc_output_multi_mt_core(const int skip_fraction, const SINC_FILTER * const fil
 			int data_index2 = filter->b_current + channels * (1 + coeff_count2);
 			// right = 0.0;
 
-			{
-				const double fraction = fp_to_double(filter_index2);
-				const int indx = fp_to_int(filter_index2);
-				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
-				const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
-				assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len);
-				assert(data_index2 + channels - 1 < filter->b_end);
-				for (int ch = 0; ch < channels; ch++)
-					right[ch] += icoeff * filter->buffer[data_index2 + ch];
-			}
-
-			filter_index2 -= increment;
-			data_index2 = data_index2 - channels;
+			do {
+				if ( enable_prefetch && filter_index2 - increment * prefetch_increment > MAKE_INCREMENT_T(0) ){
+					const int indx = fp_to_int(filter_index2 - increment * prefetch_increment);
+					mem_prefetch(&filter->coeffs[indx]);
+					mem_prefetch(&filter->coeffs[indx+1]);
+				}
 
-			while (filter_index2 > MAKE_INCREMENT_T(0))
-			{
 				const double fraction = fp_to_double(filter_index2);
 				const int indx = fp_to_int(filter_index2);
 				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
@@ -308,7 +316,8 @@ calc_output_multi_mt_core(const int skip_fraction, const SINC_FILTER * const fil
 
 				filter_index2 -= increment;
 				data_index2 = data_index2 - channels;
-			}
+			} while (filter_index2 > MAKE_INCREMENT_T(0));
+
 	}
 
 	for (int ch = 0; ch < channels; ch++)
@@ -321,14 +330,25 @@ calc_output_multi_mt_2(
 {
 
 	const int skip_fraction = increment == ((increment >> SHIFT_BITS) << SHIFT_BITS) && start_filter_index == ((start_filter_index >> SHIFT_BITS) << SHIFT_BITS) ? 1 : 0;
+	const int enable_prefetch = (filter->coeff_half_len > ARRAY_LEN (slow_mid_qual_coeffs.coeffs));
 
 	if (skip_fraction)
 	{
-			calc_output_multi_mt_core(1, filter, increment, start_filter_index, channels, scale, output);
+		if (enable_prefetch){
+			calc_output_multi_mt_core(1, 1, filter, increment, start_filter_index, channels, scale, output);
+		}
+		else{
+			calc_output_multi_mt_core(0, 1, filter, increment, start_filter_index, channels, scale, output);
+		}
 	}
 	else
 	{
-			calc_output_multi_mt_core(0, filter, increment, start_filter_index, channels, scale, output);
+		if (enable_prefetch){
+			calc_output_multi_mt_core(1, 0, filter, increment, start_filter_index, channels, scale, output);
+		}
+		else{
+			calc_output_multi_mt_core(0, 0, filter, increment, start_filter_index, channels, scale, output);
+		}
 	}
 }
 
@@ -520,7 +540,7 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	const int N_OF_CORES = omp_get_num_procs();
 
 	const int should_be_single_thread = (N_OF_CORES < 2 || in_count < MULTI_THREADING_THRESHOLD);
-	const int num_of_threads = should_be_single_thread ? 1 : (N_OF_CORES / 2 * 2);
+	const int num_of_threads = should_be_single_thread ? 1 : N_OF_CORES;
 
 	SRC_STATE *per_thread_state = (SRC_STATE *)malloc(num_of_threads * sizeof(SRC_STATE));
 	SRC_DATA *per_thread_data = (SRC_DATA *)malloc(num_of_threads * sizeof(SRC_DATA));

From 6d3132ad2151a98edbf14e59acead2234a779fb3 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Thu, 16 Nov 2023 18:33:30 +0900
Subject: [PATCH 10/18] format style

---
 src/src_sinc.c | 370 +++++++++++++++++++++++++------------------------
 1 file changed, 189 insertions(+), 181 deletions(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index 5fc8b8bf..3fb77427 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -229,7 +229,7 @@ sinc_get_description (int src_enum)
 	#define ALWAYS_INLINE __attribute__((always_inline)) static
 
 	#define mem_prefetch(ptr) __builtin_prefetch(ptr)
-	
+
 #else
 	#define ALWAYS_INLINE static
 
@@ -240,8 +240,8 @@ sinc_get_description (int src_enum)
 #define MULTI_THREADING_THRESHOLD (256)
 
 ALWAYS_INLINE void
-calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, const SINC_FILTER * const filter, 
-	const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float * const output)
+calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, const SINC_FILTER *const filter,
+						  const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
 {
 	double left[MAX_CHANNELS] = {0};
 	double right[MAX_CHANNELS] = {0};
@@ -252,76 +252,78 @@ calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, co
 	const int prefetch_increment = 8;
 
 	{
-			/* First apply the left half of the filter. */
-			increment_t filter_index1 = start_filter_index;
-			const int coeff_count1 = (max_filter_index - filter_index1) / increment;
-			filter_index1 = filter_index1 + coeff_count1 * increment;
-			int data_index1 = filter->b_current - channels * coeff_count1;
+		/* First apply the left half of the filter. */
+		increment_t filter_index1 = start_filter_index;
+		const int coeff_count1 = (max_filter_index - filter_index1) / increment;
+		filter_index1 = filter_index1 + coeff_count1 * increment;
+		int data_index1 = filter->b_current - channels * coeff_count1;
 
-			if (data_index1 < 0) /* Avoid underflow access to filter->buffer. */
-			{
-				int steps = int_div_ceil(-data_index1, channels);
-				/* If the assert triggers we would have to take care not to underflow/overflow */
-				assert(steps <= int_div_ceil(filter_index1, increment));
-				filter_index1 -= increment * steps;
-				data_index1 += steps * channels;
-			}
+		if (data_index1 < 0) /* Avoid underflow access to filter->buffer. */
+		{
+			int steps = int_div_ceil(-data_index1, channels);
+			/* If the assert triggers we would have to take care not to underflow/overflow */
+			assert(steps <= int_div_ceil(filter_index1, increment));
+			filter_index1 -= increment * steps;
+			data_index1 += steps * channels;
+		}
 
-			// left = 0.0;
-			while (filter_index1 >= MAKE_INCREMENT_T(0))
+		// left = 0.0;
+		while (filter_index1 >= MAKE_INCREMENT_T(0))
+		{
+			if (enable_prefetch && filter_index1 - increment * prefetch_increment >= MAKE_INCREMENT_T(0))
 			{
-				if ( enable_prefetch && filter_index1 - increment * prefetch_increment >= MAKE_INCREMENT_T(0) ){
-					const int indx = fp_to_int(filter_index1 - increment * prefetch_increment);
-					mem_prefetch(&filter->coeffs[indx]);
-					mem_prefetch(&filter->coeffs[indx+1]);
-				}
+				const int indx = fp_to_int(filter_index1 - increment * prefetch_increment);
+				mem_prefetch(&filter->coeffs[indx]);
+				mem_prefetch(&filter->coeffs[indx + 1]);
+			}
 
-				const double fraction = fp_to_double(filter_index1);
-				const int indx = fp_to_int(filter_index1);
-				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
-				const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
-				assert(data_index1 >= 0 && data_index1 + channels - 1 < filter->b_len);
-				assert(data_index1 + channels - 1 < filter->b_end);
-				for (int ch = 0; ch < channels; ch++)
-					left[ch] += icoeff * filter->buffer[data_index1 + ch];
-
-				filter_index1 -= increment;
-				data_index1 = data_index1 + channels;
-			};
+			const double fraction = fp_to_double(filter_index1);
+			const int indx = fp_to_int(filter_index1);
+			assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
+			const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
+			assert(data_index1 >= 0 && data_index1 + channels - 1 < filter->b_len);
+			assert(data_index1 + channels - 1 < filter->b_end);
+			for (int ch = 0; ch < channels; ch++)
+				left[ch] += icoeff * filter->buffer[data_index1 + ch];
+
+			filter_index1 -= increment;
+			data_index1 = data_index1 + channels;
+		};
 	}
 
 	{
-			/* Now apply the right half of the filter. */
-			increment_t filter_index2 = increment - start_filter_index;
-			const int coeff_count2 = (max_filter_index - filter_index2) / increment;
-			filter_index2 = filter_index2 + coeff_count2 * increment;
-			int data_index2 = filter->b_current + channels * (1 + coeff_count2);
-			// right = 0.0;
-
-			do {
-				if ( enable_prefetch && filter_index2 - increment * prefetch_increment > MAKE_INCREMENT_T(0) ){
-					const int indx = fp_to_int(filter_index2 - increment * prefetch_increment);
-					mem_prefetch(&filter->coeffs[indx]);
-					mem_prefetch(&filter->coeffs[indx+1]);
-				}
-
-				const double fraction = fp_to_double(filter_index2);
-				const int indx = fp_to_int(filter_index2);
-				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
-				const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
-				assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len);
-				assert(data_index2 + channels - 1 < filter->b_end);
-				for (int ch = 0; ch < channels; ch++)
-					right[ch] += icoeff * filter->buffer[data_index2 + ch];
-
-				filter_index2 -= increment;
-				data_index2 = data_index2 - channels;
-			} while (filter_index2 > MAKE_INCREMENT_T(0));
+		/* Now apply the right half of the filter. */
+		increment_t filter_index2 = increment - start_filter_index;
+		const int coeff_count2 = (max_filter_index - filter_index2) / increment;
+		filter_index2 = filter_index2 + coeff_count2 * increment;
+		int data_index2 = filter->b_current + channels * (1 + coeff_count2);
+		// right = 0.0;
+
+		do
+		{
+			if (enable_prefetch && filter_index2 - increment * prefetch_increment > MAKE_INCREMENT_T(0))
+			{
+				const int indx = fp_to_int(filter_index2 - increment * prefetch_increment);
+				mem_prefetch(&filter->coeffs[indx]);
+				mem_prefetch(&filter->coeffs[indx + 1]);
+			}
 
+			const double fraction = fp_to_double(filter_index2);
+			const int indx = fp_to_int(filter_index2);
+			assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
+			const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
+			assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len);
+			assert(data_index2 + channels - 1 < filter->b_end);
+			for (int ch = 0; ch < channels; ch++)
+				right[ch] += icoeff * filter->buffer[data_index2 + ch];
+
+			filter_index2 -= increment;
+			data_index2 = data_index2 - channels;
+		} while (filter_index2 > MAKE_INCREMENT_T(0));
 	}
 
 	for (int ch = 0; ch < channels; ch++)
-			output[ch] = (float)(scale * (left[ch] + right[ch]));
+		output[ch] = (float)(scale * (left[ch] + right[ch]));
 } /* calc_output_multi_mt_core */
 
 ALWAYS_INLINE void
@@ -330,23 +332,27 @@ calc_output_multi_mt_2(
 {
 
 	const int skip_fraction = increment == ((increment >> SHIFT_BITS) << SHIFT_BITS) && start_filter_index == ((start_filter_index >> SHIFT_BITS) << SHIFT_BITS) ? 1 : 0;
-	const int enable_prefetch = (filter->coeff_half_len > ARRAY_LEN (slow_mid_qual_coeffs.coeffs));
+	const int enable_prefetch = (filter->coeff_half_len > ARRAY_LEN(slow_mid_qual_coeffs.coeffs));
 
 	if (skip_fraction)
 	{
-		if (enable_prefetch){
+		if (enable_prefetch)
+		{
 			calc_output_multi_mt_core(1, 1, filter, increment, start_filter_index, channels, scale, output);
 		}
-		else{
+		else
+		{
 			calc_output_multi_mt_core(0, 1, filter, increment, start_filter_index, channels, scale, output);
 		}
 	}
 	else
 	{
-		if (enable_prefetch){
+		if (enable_prefetch)
+		{
 			calc_output_multi_mt_core(1, 0, filter, increment, start_filter_index, channels, scale, output);
 		}
-		else{
+		else
+		{
 			calc_output_multi_mt_core(0, 0, filter, increment, start_filter_index, channels, scale, output);
 		}
 	}
@@ -355,49 +361,49 @@ calc_output_multi_mt_2(
 ALWAYS_INLINE void
 calc_output_multi_mt(const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
 {
-#define OPTIMIZE_LINE(x)                                                                                               \
-	case (x):                                                                                                          \
-			calc_output_multi_mt_2(filter, increment, start_filter_index, x, scale, output); \
-			break;
+#define OPTIMIZE_LINE(x)                                                                 \
+	case (x):                                                                            \
+		calc_output_multi_mt_2(filter, increment, start_filter_index, x, scale, output); \
+		break;
 
 	switch (channels) // to kick the compile-time optimizer, channel numbers up to 16 are extracted as constants here.
 	{
-			OPTIMIZE_LINE(1);
-			OPTIMIZE_LINE(2);
-			OPTIMIZE_LINE(3);
-			OPTIMIZE_LINE(4);
-			OPTIMIZE_LINE(5);
-			OPTIMIZE_LINE(6);
-			OPTIMIZE_LINE(7);
-			OPTIMIZE_LINE(8);
-			OPTIMIZE_LINE(9);
-			OPTIMIZE_LINE(10);
-			OPTIMIZE_LINE(11);
-			OPTIMIZE_LINE(12);
-			OPTIMIZE_LINE(13);
-			OPTIMIZE_LINE(14);
-			OPTIMIZE_LINE(15);
-			OPTIMIZE_LINE(16);
+		OPTIMIZE_LINE(1);
+		OPTIMIZE_LINE(2);
+		OPTIMIZE_LINE(3);
+		OPTIMIZE_LINE(4);
+		OPTIMIZE_LINE(5);
+		OPTIMIZE_LINE(6);
+		OPTIMIZE_LINE(7);
+		OPTIMIZE_LINE(8);
+		OPTIMIZE_LINE(9);
+		OPTIMIZE_LINE(10);
+		OPTIMIZE_LINE(11);
+		OPTIMIZE_LINE(12);
+		OPTIMIZE_LINE(13);
+		OPTIMIZE_LINE(14);
+		OPTIMIZE_LINE(15);
+		OPTIMIZE_LINE(16);
 	default:
-			calc_output_multi_mt_2(filter, increment, start_filter_index, channels, scale, output);
-			break;
+		calc_output_multi_mt_2(filter, increment, start_filter_index, channels, scale, output);
+		break;
 	}
 #undef OPTIMIZE_LINE
 }
 
 static SRC_ERROR
 _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no,
-	SRC_STATE * const state, SRC_DATA * const data, SRC_STATE *const main_state )
+								SRC_STATE *const state, SRC_DATA *const data, SRC_STATE *const main_state)
 {
 	if (state->private_data == NULL)
-			return SRC_ERR_NO_PRIVATE;
+		return SRC_ERR_NO_PRIVATE;
 
 	SINC_FILTER *filter = (SINC_FILTER *)state->private_data;
 	SINC_FILTER *main_filter = (SINC_FILTER *)main_state->private_data;
 
 	/* If there is not a problem, this will be optimised out. */
 	if (sizeof(filter->buffer[0]) != sizeof(data->data_in[0]))
-			return SRC_ERR_SIZE_INCOMPATIBILITY;
+		return SRC_ERR_SIZE_INCOMPATIBILITY;
 
 	const int channels = state->channels;
 	filter->in_count = data->input_frames * channels;
@@ -407,12 +413,12 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no,
 	double src_ratio = state->last_ratio;
 
 	if (is_bad_src_ratio(src_ratio))
-			return SRC_ERR_BAD_INTERNAL_STATE;
+		return SRC_ERR_BAD_INTERNAL_STATE;
 
 	/* Check the sample rate ratio wrt the buffer len. */
 	double count = (filter->coeff_half_len + 2.0) / filter->index_inc;
 	if (MIN(state->last_ratio, data->src_ratio) < 1.0)
-			count /= MIN(state->last_ratio, data->src_ratio);
+		count /= MIN(state->last_ratio, data->src_ratio);
 
 	/* Maximum coefficientson either side of center point. */
 	const int half_filter_chan_len = channels * (int)(psf_lrint(count) + 1);
@@ -436,80 +442,83 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no,
 
 	/* Main processing loop. */
 	int interleave_counter = 0;
-	float * const data_out = data->data_out;
-	
+	float *const data_out = data->data_out;
+
 	while (filter->out_gen < out_count)
 	{
-			/* Need to reload buffer? */
-			int samples_in_hand = ( filter->b_end < filter->b_current ) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current);
+		/* Need to reload buffer? */
+		int samples_in_hand = (filter->b_end < filter->b_current) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current);
 
-			if (samples_in_hand <= half_filter_chan_len)
+		if (samples_in_hand <= half_filter_chan_len)
+		{
+			// only one buffer is used (shared by all threads)
 			{
-				// only one buffer is used (shared by all threads)
+				#pragma omp barrier
+				#pragma omp single
+				{
+					state->error = prepare_data(filter, channels, data, half_filter_chan_len);
+
+					*main_state = *state;
+					*main_filter = *filter;
+				}
+				#pragma omp barrier					
 				{
-					#pragma omp barrier
-					#pragma omp single
-					{
-						state->error = prepare_data(filter, channels, data, half_filter_chan_len);
-						
-						*main_state = *state;
-						*main_filter = *filter;
-					}
-					#pragma omp barrier					
-					{
-						*state = *main_state;
-						*filter = *main_filter;
-					}
+					*state = *main_state;
+					*filter = *main_filter;
 				}
+			}
 
-				if (state->error != 0)
-					return state->error;
+			if (state->error != 0)
+				return state->error;
 
-				samples_in_hand = ( filter->b_end < filter->b_current ) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current);
-				if (samples_in_hand <= half_filter_chan_len)
-					break;
-			};
+			samples_in_hand = (filter->b_end < filter->b_current) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current);
+			if (samples_in_hand <= half_filter_chan_len)
+				break;
+		};
 
-			/* This is the termination condition. */
-			if (filter->b_real_end >= 0)
-			{
-				if (filter->b_current + input_index + terminate > filter->b_real_end)
-					break;
-			};
+		/* This is the termination condition. */
+		if (filter->b_real_end >= 0)
+		{
+			if (filter->b_current + input_index + terminate > filter->b_real_end)
+				break;
+		};
 
-			double scale, float_increment;
-			increment_t increment;
-			if (!is_constant_ratio)
-			{
-				if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10)
-					src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count;
+		double scale, float_increment;
+		increment_t increment;
+		if (!is_constant_ratio)
+		{
+			if (out_count > 0 && fabs(state->last_ratio - data->src_ratio) > 1e-10)
+				src_ratio = state->last_ratio + filter->out_gen * (data->src_ratio - state->last_ratio) / out_count;
 
-				float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0);
-				increment = double_to_fp(float_increment);
-				scale = float_increment / index_inc;
-			}
-			else
-			{
-				float_increment = constant_float_increment;
-				increment = constant_increment;
-				scale = constant_scale;
-			}
+			float_increment = index_inc * (src_ratio < 1.0 ? src_ratio : 1.0);
+			increment = double_to_fp(float_increment);
+			scale = float_increment / index_inc;
+		}
+		else
+		{
+			float_increment = constant_float_increment;
+			increment = constant_increment;
+			scale = constant_scale;
+		}
 
-			increment_t start_filter_index = double_to_fp(input_index * float_increment);
+		increment_t start_filter_index = double_to_fp(input_index * float_increment);
 
-			if ( child_no == interleave_counter ){
-				calc_output_multi_mt(filter, increment, start_filter_index, channels, scale, data_out + filter->out_gen);
-			}
-			if ( ++interleave_counter == num_of_threads ) interleave_counter = 0;
-			filter->out_gen += channels;
+		if (child_no == interleave_counter)
+		{
+			calc_output_multi_mt(filter, increment, start_filter_index, channels, scale, data_out + filter->out_gen);
+		}
+		if (++interleave_counter == num_of_threads)
+			interleave_counter = 0;
+		filter->out_gen += channels;
 
-			/* Figure out the next index. */
-			input_index += (is_constant_ratio) ? constant_input_index_inc : 1.0 / src_ratio;
-			rem = fmod_one(input_index);
+		/* Figure out the next index. */
+		input_index += (is_constant_ratio) ? constant_input_index_inc : 1.0 / src_ratio;
+		rem = fmod_one(input_index);
 
-			filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem));
-			if ( filter->b_current >= filter->b_len ) filter->b_current -= filter->b_len;
-			input_index = rem;
+		filter->b_current = (filter->b_current + channels * psf_lrint(input_index - rem));
+		if (filter->b_current >= filter->b_len)
+			filter->b_current -= filter->b_len;
+		input_index = rem;
 	};
 
 	state->last_position = input_index;
@@ -527,7 +536,7 @@ static SRC_ERROR
 sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 {
 	if (state->private_data == NULL)
-			return SRC_ERR_NO_PRIVATE;
+		return SRC_ERR_NO_PRIVATE;
 
 	const int channels = state->channels;
 
@@ -549,55 +558,54 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 
 	SRC_ERROR retval = SRC_ERR_MALLOC_FAILED;
 
-	if ( !per_thread_state || !per_thread_data || !per_thread_filter
-		 || !per_thread_retval )
+	if (!per_thread_state || !per_thread_data || !per_thread_filter || !per_thread_retval)
 	{
-			goto cleanup_and_return;
+		goto cleanup_and_return;
 	}
 
 	// OpenMP
 	omp_set_dynamic(0);
 	omp_set_num_threads(num_of_threads);
 
-	assert( num_of_threads == omp_get_num_threads() );
+	assert(num_of_threads == omp_get_num_threads());
 
 	if (num_of_threads == 1) // w/o OpenMP
 	{
-			per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data, state);
+		per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data, state);
 
-			retval = per_thread_retval[0];
+		retval = per_thread_retval[0];
 
-			goto cleanup_and_return;
+		goto cleanup_and_return;
 	}
 
 	int omp_child_no;
 
-#pragma omp parallel for
+	#pragma omp parallel for
 	for (omp_child_no = 0; omp_child_no < num_of_threads; omp_child_no++)
 	{
-			const int child_no = omp_child_no;
+		const int child_no = omp_child_no;
 
-			memcpy(&per_thread_data[child_no], data, sizeof(SRC_DATA));
-			memcpy(&per_thread_filter[child_no], filter, sizeof(SINC_FILTER));
+		memcpy(&per_thread_data[child_no], data, sizeof(SRC_DATA));
+		memcpy(&per_thread_filter[child_no], filter, sizeof(SINC_FILTER));
 
-			memcpy(&per_thread_state[child_no], state, sizeof(SRC_STATE));
-			per_thread_state[child_no].private_data = &per_thread_filter[child_no];
+		memcpy(&per_thread_state[child_no], state, sizeof(SRC_STATE));
+		per_thread_state[child_no].private_data = &per_thread_filter[child_no];
 
-			per_thread_filter[child_no].buffer = filter->buffer;
+		per_thread_filter[child_no].buffer = filter->buffer;
 
-			per_thread_retval[child_no] = _sinc_multichan_vari_process_mt(
-				num_of_threads, child_no, 
-				&per_thread_state[child_no], &per_thread_data[child_no], state);
+		per_thread_retval[child_no] = _sinc_multichan_vari_process_mt(
+			num_of_threads, child_no,
+			&per_thread_state[child_no], &per_thread_data[child_no], state);
 	}
 
 	// error checking for each worker
 	for (int child_no = 0; child_no < num_of_threads; child_no++)
 	{
-			if (per_thread_retval[child_no] != SRC_ERR_NO_ERROR)
-			{
-				retval = per_thread_retval[child_no];
-				goto cleanup_and_return;
-			}
+		if (per_thread_retval[child_no] != SRC_ERR_NO_ERROR)
+		{
+			retval = per_thread_retval[child_no];
+			goto cleanup_and_return;
+		}
 	}
 
 	// update filter status
@@ -607,7 +615,7 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 
 	memcpy(state, &per_thread_state[0], sizeof(SRC_STATE));
 	state->private_data = filter;
-	
+
 	memcpy(data, &per_thread_data[0], sizeof(SRC_DATA));
 
 	retval = SRC_ERR_NO_ERROR;
@@ -615,16 +623,16 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 cleanup_and_return:
 
 	if (per_thread_state)
-			free(per_thread_state);
+		free(per_thread_state);
 
 	if (per_thread_data)
-			free(per_thread_data);
+		free(per_thread_data);
 
 	if (per_thread_filter)
-			free(per_thread_filter);
+		free(per_thread_filter);
 
 	if (per_thread_retval)
-			free(per_thread_retval);
+		free(per_thread_retval);
 
 	return retval;
 }

From 99d439649e4696ba61e49224d30c29ccd17b6b78 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Fri, 17 Nov 2023 02:43:46 +0900
Subject: [PATCH 11/18] fix for debug build

---
 src/src_sinc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index 3fb77427..9c93f024 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -567,9 +567,9 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	omp_set_dynamic(0);
 	omp_set_num_threads(num_of_threads);
 
-	assert(num_of_threads == omp_get_num_threads());
+	//assert(num_of_threads == omp_get_max_threads());
 
-	if (num_of_threads == 1) // w/o OpenMP
+	if (num_of_threads == 1 || omp_get_max_threads() == 1 ) // w/o OpenMP
 	{
 		per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data, state);
 

From 7348582e57dcf5c0ee65ff104f413893648489a8 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Fri, 17 Nov 2023 22:13:33 +0900
Subject: [PATCH 12/18] change WIN32 to MSVC for openmp build option switch

---
 src/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 21ce661c..02306312 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -44,7 +44,7 @@ add_library(samplerate
 add_library(SampleRate::samplerate ALIAS samplerate)
 
 if(MULTI_THREADING)
-  if(WIN32)
+  if(MSVC)
     target_link_libraries(samplerate PRIVATE libomp)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /openmp:llvm")
   else()

From bd6aec33ee6659f32b76ca365de6a4123580478a Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Fri, 17 Nov 2023 22:25:56 +0900
Subject: [PATCH 13/18] cache once-calculated coeffs if src_ratio is integer

---
 src/src_sinc.c                    | 275 ++++++++++++++++++++++++++----
 tests/multichan_throughput_test.c |  61 ++++---
 2 files changed, 273 insertions(+), 63 deletions(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index 9c93f024..225570d1 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -229,7 +229,7 @@ sinc_get_description (int src_enum)
 	#define ALWAYS_INLINE __attribute__((always_inline)) static
 
 	#define mem_prefetch(ptr) __builtin_prefetch(ptr)
-
+	
 #else
 	#define ALWAYS_INLINE static
 
@@ -239,8 +239,32 @@ sinc_get_description (int src_enum)
 /* smaller frames are processed in single thread to avoid overheads */
 #define MULTI_THREADING_THRESHOLD (256)
 
+#define MT_COEFFS_CACHING 1
+
+enum MT_CACHE_MODE
+{
+	MT_CACHE_NONE,
+	MT_CACHE_READ,
+	MT_CACHE_WRITE
+};
+
+typedef struct mt_cache_t
+{
+	enum MT_CACHE_MODE cache_state;
+	increment_t start_filter_index;
+	double *coeffs;
+} mt_cache_t;
+
+typedef struct mt_cache_array_t
+{
+	int len;
+	int len2;
+	mt_cache_t *caches;
+} mt_cache_array_t;
+
 ALWAYS_INLINE void
-calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, const SINC_FILTER *const filter,
+calc_output_multi_mt_core(const enum MT_CACHE_MODE use_cache, mt_cache_t *const cache, const int cache_len,
+						  const int enable_prefetch, const int skip_fraction, const SINC_FILTER *const filter,
 						  const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
 {
 	double left[MAX_CHANNELS] = {0};
@@ -251,6 +275,8 @@ calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, co
 
 	const int prefetch_increment = 8;
 
+	int cache_idx = 0;
+
 	{
 		/* First apply the left half of the filter. */
 		increment_t filter_index1 = start_filter_index;
@@ -270,19 +296,42 @@ calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, co
 		// left = 0.0;
 		while (filter_index1 >= MAKE_INCREMENT_T(0))
 		{
-			if (enable_prefetch && filter_index1 - increment * prefetch_increment >= MAKE_INCREMENT_T(0))
+			double coeff;
+
+			if (use_cache == MT_CACHE_READ)
+			{
+				coeff = cache->coeffs[cache_idx++];
+
+				assert(cache_idx <= cache_len);
+			}
+			else
 			{
-				const int indx = fp_to_int(filter_index1 - increment * prefetch_increment);
-				mem_prefetch(&filter->coeffs[indx]);
-				mem_prefetch(&filter->coeffs[indx + 1]);
+				if (enable_prefetch && filter_index1 - increment * prefetch_increment >= MAKE_INCREMENT_T(0))
+				{
+					const int indx = fp_to_int(filter_index1 - increment * prefetch_increment);
+					mem_prefetch(&filter->coeffs[indx]);
+					mem_prefetch(&filter->coeffs[indx + 1]);
+				}
+
+				const double fraction = fp_to_double(filter_index1);
+				const int indx = fp_to_int(filter_index1);
+				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
+				const coeff_t coeff_val = filter->coeffs[indx];
+				const coeff_t coeff_fraction = filter->coeffs[indx + 1] - filter->coeffs[indx];
+				coeff = skip_fraction ? coeff_val : coeff_val + fraction * coeff_fraction;
+				assert(data_index1 >= 0 && data_index1 + channels - 1 < filter->b_len);
+				assert(data_index1 + channels - 1 < filter->b_end);
+
+				if (use_cache == MT_CACHE_WRITE)
+				{
+					coeff = cache->coeffs[cache_idx++] = coeff;
+
+					assert(cache_idx <= cache_len);
+				}
 			}
 
-			const double fraction = fp_to_double(filter_index1);
-			const int indx = fp_to_int(filter_index1);
-			assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
-			const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
-			assert(data_index1 >= 0 && data_index1 + channels - 1 < filter->b_len);
-			assert(data_index1 + channels - 1 < filter->b_end);
+			const double icoeff = coeff;
+
 			for (int ch = 0; ch < channels; ch++)
 				left[ch] += icoeff * filter->buffer[data_index1 + ch];
 
@@ -301,34 +350,65 @@ calc_output_multi_mt_core(const int enable_prefetch, const int skip_fraction, co
 
 		do
 		{
-			if (enable_prefetch && filter_index2 - increment * prefetch_increment > MAKE_INCREMENT_T(0))
+			double coeff;
+
+			if (use_cache == MT_CACHE_READ)
+			{
+				coeff = cache->coeffs[cache_idx++];
+
+				assert(cache_idx <= cache_len);
+			}
+			else
 			{
-				const int indx = fp_to_int(filter_index2 - increment * prefetch_increment);
-				mem_prefetch(&filter->coeffs[indx]);
-				mem_prefetch(&filter->coeffs[indx + 1]);
+				if (enable_prefetch && filter_index2 - increment * prefetch_increment > MAKE_INCREMENT_T(0))
+				{
+					const int indx = fp_to_int(filter_index2 - increment * prefetch_increment);
+					mem_prefetch(&filter->coeffs[indx]);
+					mem_prefetch(&filter->coeffs[indx + 1]);
+				}
+
+				const double fraction = fp_to_double(filter_index2);
+				const int indx = fp_to_int(filter_index2);
+				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
+				const coeff_t coeff_val = filter->coeffs[indx];
+				const coeff_t coeff_fraction = (float)filter->coeffs[indx + 1] - (float)filter->coeffs[indx];
+				coeff = skip_fraction ? coeff_val : coeff_val + fraction * coeff_fraction;
+				assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len);
+				assert(data_index2 + channels - 1 < filter->b_end);
+
+				if (use_cache == MT_CACHE_WRITE)
+				{
+					coeff = cache->coeffs[cache_idx++] = coeff;
+
+					assert(cache_idx <= cache_len);
+				}
 			}
 
-			const double fraction = fp_to_double(filter_index2);
-			const int indx = fp_to_int(filter_index2);
-			assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
-			const double icoeff = skip_fraction ? filter->coeffs[indx] : filter->coeffs[indx] + fraction * (filter->coeffs[indx + 1] - filter->coeffs[indx]);
-			assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len);
-			assert(data_index2 + channels - 1 < filter->b_end);
+			const double icoeff = coeff;
+
 			for (int ch = 0; ch < channels; ch++)
 				right[ch] += icoeff * filter->buffer[data_index2 + ch];
 
+			const double c_coeff = coeff;
 			filter_index2 -= increment;
 			data_index2 = data_index2 - channels;
+
 		} while (filter_index2 > MAKE_INCREMENT_T(0));
 	}
 
+	if (use_cache == MT_CACHE_WRITE)
+	{
+		cache->start_filter_index = start_filter_index;
+		cache->cache_state = MT_CACHE_READ;
+	}
+
 	for (int ch = 0; ch < channels; ch++)
 		output[ch] = (float)(scale * (left[ch] + right[ch]));
 } /* calc_output_multi_mt_core */
 
 ALWAYS_INLINE void
-calc_output_multi_mt_2(
-	const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
+calc_output_multi_mt_3(const int use_cache, mt_cache_t *const cache, const int cache_len, const SINC_FILTER *const filter,
+					   const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
 {
 
 	const int skip_fraction = increment == ((increment >> SHIFT_BITS) << SHIFT_BITS) && start_filter_index == ((start_filter_index >> SHIFT_BITS) << SHIFT_BITS) ? 1 : 0;
@@ -338,32 +418,81 @@ calc_output_multi_mt_2(
 	{
 		if (enable_prefetch)
 		{
-			calc_output_multi_mt_core(1, 1, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_core(use_cache, cache, cache_len, 1, 1, filter, increment, start_filter_index, channels, scale, output);
 		}
 		else
 		{
-			calc_output_multi_mt_core(0, 1, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_core(use_cache, cache, cache_len, 0, 1, filter, increment, start_filter_index, channels, scale, output);
 		}
 	}
 	else
 	{
 		if (enable_prefetch)
 		{
-			calc_output_multi_mt_core(1, 0, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_core(use_cache, cache, cache_len, 1, 0, filter, increment, start_filter_index, channels, scale, output);
 		}
 		else
 		{
-			calc_output_multi_mt_core(0, 0, filter, increment, start_filter_index, channels, scale, output);
+			calc_output_multi_mt_core(use_cache, cache, cache_len, 0, 0, filter, increment, start_filter_index, channels, scale, output);
 		}
 	}
 }
 
 ALWAYS_INLINE void
-calc_output_multi_mt(const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
+calc_output_multi_mt_2(mt_cache_array_t *cache_array, const SINC_FILTER *const filter,
+					   const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
 {
-#define OPTIMIZE_LINE(x)                                                                 \
-	case (x):                                                                            \
-		calc_output_multi_mt_2(filter, increment, start_filter_index, x, scale, output); \
+	const int cache_len = cache_array->len2;
+	const int idx = cache_array->len ? (int)(start_filter_index / (increment / (cache_array->len - 1))) : 0;
+
+	mt_cache_t *cache = (cache_array->len && idx < cache_array->len) ? &cache_array->caches[idx] : NULL;
+
+	enum MT_CACHE_MODE use_cache = MT_CACHE_NONE;
+
+	if (cache)
+	{
+		enum MT_CACHE_MODE cache_state = cache->cache_state;
+
+		if (cache_state == MT_CACHE_READ)
+		{
+			if (start_filter_index == cache->start_filter_index)
+			{
+				use_cache = MT_CACHE_READ;
+			}
+			// else {
+			//	assert(0);	// not expected to come here, but not harmful, so commenting out
+			//	exit(0);
+			// }
+		}
+		else if (cache_state == MT_CACHE_NONE)
+		{
+			cache->cache_state = MT_CACHE_WRITE;
+			use_cache = MT_CACHE_WRITE;
+		}
+	}
+
+	if (use_cache == MT_CACHE_READ)
+	{
+		// skip to core, since skip_fraction/enable_prefetch will not affect
+		calc_output_multi_mt_core(MT_CACHE_READ, cache, cache_len, 0, 0, filter, increment, start_filter_index, channels, scale, output);
+	}
+	else if (use_cache == MT_CACHE_WRITE)
+	{
+		calc_output_multi_mt_3(MT_CACHE_WRITE, cache, cache_len, filter, increment, start_filter_index, channels, scale, output);
+	}
+	else
+	{
+		calc_output_multi_mt_3(MT_CACHE_NONE, cache, cache_len, filter, increment, start_filter_index, channels, scale, output);
+	}
+}
+
+ALWAYS_INLINE void
+calc_output_multi_mt(mt_cache_array_t *cache_array,
+					 const SINC_FILTER *const filter, const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
+{
+#define OPTIMIZE_LINE(x)                                                                              \
+	case (x):                                                                                         \
+		calc_output_multi_mt_2(cache_array, filter, increment, start_filter_index, x, scale, output); \
 		break;
 
 	switch (channels) // to kick the compile-time optimizer, channel numbers up to 16 are extracted as constants here.
@@ -385,7 +514,7 @@ calc_output_multi_mt(const SINC_FILTER *const filter, const increment_t incremen
 		OPTIMIZE_LINE(15);
 		OPTIMIZE_LINE(16);
 	default:
-		calc_output_multi_mt_2(filter, increment, start_filter_index, channels, scale, output);
+		calc_output_multi_mt_2(cache_array, filter, increment, start_filter_index, channels, scale, output);
 		break;
 	}
 #undef OPTIMIZE_LINE
@@ -444,6 +573,56 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no,
 	int interleave_counter = 0;
 	float *const data_out = data->data_out;
 
+	mt_cache_array_t _cache_array = {0};
+	mt_cache_array_t *const cache_array = &_cache_array;
+
+#if MT_COEFFS_CACHING
+	// Caching once-calculated (interpolated) coeffs in memory is reasonable if the src_ratio is an integer
+	// because only limited number of coeffs are cyclically used in those cases.
+	// Drawback is that the processing speed can fluctuate if the condition (src_ratio) changes.
+
+	if (is_constant_ratio && src_ratio == (int)src_ratio)
+	{
+
+		do
+		{
+			int len = (int)src_ratio + 1;
+			int len2 = 0;
+			cache_array->caches = (mt_cache_t *)calloc(len, sizeof(mt_cache_t));
+			if (!cache_array->caches)
+			{
+				break;
+			}
+
+			for (int i = 0; i < len; i++)
+			{
+				len2 = ((filter->coeff_half_len + 2) / filter->index_inc + filter->index_inc);
+				cache_array->caches[i].coeffs = (double *)calloc(len2, sizeof(double));
+				if (!cache_array->caches[i].coeffs)
+				{
+					for (int j = 0; j < i; j++)
+					{
+						free(cache_array->caches[j].coeffs);
+						cache_array->caches[j].coeffs = NULL;
+					}
+					free(cache_array->caches);
+					cache_array->caches = NULL;
+					break;
+				}
+			}
+
+			if (cache_array->caches)
+			{
+				cache_array->len = len;
+				cache_array->len2 = len2;
+			}
+
+		} while (0);
+	}
+#endif
+
+	int rtn = SRC_ERR_NO_ERROR;
+
 	while (filter->out_gen < out_count)
 	{
 		/* Need to reload buffer? */
@@ -469,7 +648,10 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no,
 			}
 
 			if (state->error != 0)
-				return state->error;
+			{
+				rtn = state->error;
+				break;
+			}
 
 			samples_in_hand = (filter->b_end < filter->b_current) ? (filter->b_end - filter->b_current + filter->b_len) : (filter->b_end - filter->b_current);
 			if (samples_in_hand <= half_filter_chan_len)
@@ -505,7 +687,7 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no,
 
 		if (child_no == interleave_counter)
 		{
-			calc_output_multi_mt(filter, increment, start_filter_index, channels, scale, data_out + filter->out_gen);
+			calc_output_multi_mt(cache_array, filter, increment, start_filter_index, channels, scale, data_out + filter->out_gen);
 		}
 		if (++interleave_counter == num_of_threads)
 			interleave_counter = 0;
@@ -521,6 +703,25 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no,
 		input_index = rem;
 	};
 
+#if MT_COEFFS_CACHING
+	{
+		if (cache_array->len)
+		{
+			for (int i = 0; i < cache_array->len; i++)
+			{
+				free(cache_array->caches[i].coeffs);
+				cache_array->caches[i].coeffs = NULL;
+			}
+			free(cache_array->caches);
+			cache_array->caches = NULL;
+			cache_array->len = 0;
+		}
+	}
+#endif
+
+	if (rtn)
+		return rtn;
+
 	state->last_position = input_index;
 
 	/* Save current ratio rather then target ratio. */
@@ -567,9 +768,9 @@ sinc_multithread_vari_process(SRC_STATE *state, SRC_DATA *data)
 	omp_set_dynamic(0);
 	omp_set_num_threads(num_of_threads);
 
-	//assert(num_of_threads == omp_get_max_threads());
+	// assert(num_of_threads == omp_get_max_threads());
 
-	if (num_of_threads == 1 || omp_get_max_threads() == 1 ) // w/o OpenMP
+	if (num_of_threads == 1 || omp_get_max_threads() == 1) // w/o OpenMP
 	{
 		per_thread_retval[0] = _sinc_multichan_vari_process_mt(1, 0, state, data, state);
 
diff --git a/tests/multichan_throughput_test.c b/tests/multichan_throughput_test.c
index 7c88852b..b0aa2ea5 100644
--- a/tests/multichan_throughput_test.c
+++ b/tests/multichan_throughput_test.c
@@ -36,10 +36,10 @@ static float input [BUFFER_LEN] ;
 
 #if (defined(ENABLE_SINC_FAST_CONVERTER) || defined(ENABLE_SINC_MEDIUM_CONVERTER) || \
 	defined(ENABLE_SINC_BEST_CONVERTER))
-static float output [BUFFER_LEN] ;
+static float output [BUFFER_LEN*2] ;
 
 static void
-throughput_test (int converter, int channels, long *best_throughput)
+throughput_test (int converter, int channels, long *best_throughput, double src_ratio)
 {	SRC_DATA src_data ;
 #if !defined(_WIN32) && defined(MULTI_THREADING)
 	struct timespec start_gettime, finish_gettime;
@@ -50,7 +50,7 @@ throughput_test (int converter, int channels, long *best_throughput)
 	long total_frames = 0, throughput ;
 	int error ;
 
-	printf ("    %-30s     %2d         ", src_get_name (converter), channels) ;
+	printf ("    %-30s   %2d     ", src_get_name (converter), channels) ;
 	fflush (stdout) ;
 
 	src_data.data_in = input ;
@@ -59,7 +59,7 @@ throughput_test (int converter, int channels, long *best_throughput)
 	src_data.data_out = output ;
 	src_data.output_frames = ARRAY_LEN (output) / channels ;
 
-	src_data.src_ratio = 0.99 ;
+	src_data.src_ratio = src_ratio ;
 
 #ifdef _WIN32
 	Sleep (2000) ;
@@ -94,23 +94,25 @@ throughput_test (int converter, int channels, long *best_throughput)
 	}
 	while (duration < 5.0) ;
 
-	if (src_data.input_frames_used != src_data.input_frames)
-	{	printf ("\n\nLine %d : input frames used %ld should be %ld\n", __LINE__, src_data.input_frames_used, src_data.input_frames) ;
-		exit (1) ;
-		} ;
-
-	if (fabs (src_data.src_ratio * src_data.input_frames_used - src_data.output_frames_gen) > 2)
-	{	printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ;
-		printf ("    input len  : %d\n", ARRAY_LEN (input) / channels) ;
-		printf ("    output len : %ld (should be %g +/- 2)\n\n", src_data.output_frames_gen,
-				floor (0.5 + src_data.src_ratio * src_data.input_frames_used)) ;
-		exit (1) ;
-		} ;
+	if ( src_ratio <= 1.0 ){
+		if (src_data.input_frames_used != src_data.input_frames)
+		{	printf ("\n\nLine %d : input frames used %ld should be %ld\n", __LINE__, src_data.input_frames_used, src_data.input_frames) ;
+			exit (1) ;
+			} ;
+	
+		if (fabs (src_data.src_ratio * src_data.input_frames_used - src_data.output_frames_gen) > 2)
+		{	printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ;
+			printf ("    input len  : %d\n", ARRAY_LEN (input) / channels) ;
+			printf ("    output len : %ld (should be %g +/- 2)\n\n", src_data.output_frames_gen,
+					floor (0.5 + src_data.src_ratio * src_data.input_frames_used)) ;
+			exit (1) ;
+			} ;
+	}
 
 	throughput = lrint (floor (total_frames / duration)) ;
 
 	if (!best_throughput)
-	{	printf ("%5.2f      %10ld\n", duration, throughput) ;
+	{	printf ("%5.2f      %10ld (x%7.2f)\n", duration, throughput, (throughput/src_ratio/44100)) ;
 		}
 	else
 	{	*best_throughput = MAX (throughput, *best_throughput) ;
@@ -131,31 +133,38 @@ single_run (void)
 
 	printf ("\n    CPU name : %s\n", get_cpu_name ()) ;
 
+	double src_ratio[] = {0.99, 2.0, 7.0, 0.25};
+
+for( int i=0 ; i<sizeof(src_ratio)/sizeof(double) ; i++ ){
+	printf ("\n    SRC_RATIO : %.2lf\n", src_ratio[i]) ;
+	
 	puts (
 		"\n"
-		"    Converter                        Channels    Duration      Throughput\n"
-		"    ---------------------------------------------------------------------"
+		"    Converter                        Ch    Duration  Throughput (times faster than realtime if 44.1k in)\n"
+		"    -------------------------------------------------------------------------"
 		) ;
 
 #ifdef ENABLE_SINC_FAST_CONVERTER
 	for (k = 1 ; k <= max_channels / 2 ; k++)
-		throughput_test (SRC_SINC_FASTEST, k, 0) ;
+		throughput_test (SRC_SINC_FASTEST, k, 0, src_ratio[i]) ;
 
 	puts ("") ;
 #endif
 
 #ifdef ENABLE_SINC_MEDIUM_CONVERTER
 	for (k = 1 ; k <= max_channels / 2 ; k++)
-		throughput_test (SRC_SINC_MEDIUM_QUALITY, k, 0) ;
+		throughput_test (SRC_SINC_MEDIUM_QUALITY, k, 0, src_ratio[i]) ;
 
 	puts ("") ;
 #endif
 
 #ifdef ENABLE_SINC_BEST_CONVERTER
 	for (k = 1 ; k <= max_channels ; k++)
-		throughput_test (SRC_SINC_BEST_QUALITY, k, 0) ;
+		throughput_test (SRC_SINC_BEST_QUALITY, k, 0, src_ratio[i]) ;
 	puts ("") ;
 #endif
+
+}
 	return ;
 } /* single_run */
 
@@ -167,7 +176,7 @@ multi_run (int run_count)
 
 	puts (
 		"\n"
-		"    Converter                        Channels    Duration      Throughput    Best Throughput\n"
+		"    Converter                        Ch    Duration      Throughput    Best Throughput\n"
 		"    ----------------------------------------------------------------------------------------"
 		) ;
 
@@ -187,13 +196,13 @@ multi_run (int run_count)
 		for (int k = 0 ; k < run_count ; k++)
 		{
 #ifdef ENABLE_SINC_FAST_CONVERTER
-			throughput_test (SRC_SINC_FASTEST, ch, &sinc_fastest) ;
+			throughput_test (SRC_SINC_FASTEST, ch, &sinc_fastest, 0.99) ;
 #endif
 #ifdef ENABLE_SINC_MEDIUM_CONVERTER
-			throughput_test (SRC_SINC_MEDIUM_QUALITY, ch, &sinc_medium) ;
+			throughput_test (SRC_SINC_MEDIUM_QUALITY, ch, &sinc_medium, 0.99) ;
 #endif
 #ifdef ENABLE_SINC_BEST_CONVERTER
-			throughput_test (SRC_SINC_BEST_QUALITY, ch, &sinc_best) ;
+			throughput_test (SRC_SINC_BEST_QUALITY, ch, &sinc_best, 0.99) ;
 #endif
 
 			puts ("") ;

From 0738d4d40cdd48deed7ad9f97a4a6fbf613d5d12 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Fri, 17 Nov 2023 23:07:21 +0900
Subject: [PATCH 14/18] fix type and clarify zero

---
 src/src_sinc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index 225570d1..9db3ace9 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -243,7 +243,7 @@ sinc_get_description (int src_enum)
 
 enum MT_CACHE_MODE
 {
-	MT_CACHE_NONE,
+	MT_CACHE_NONE = 0,
 	MT_CACHE_READ,
 	MT_CACHE_WRITE
 };
@@ -407,7 +407,7 @@ calc_output_multi_mt_core(const enum MT_CACHE_MODE use_cache, mt_cache_t *const
 } /* calc_output_multi_mt_core */
 
 ALWAYS_INLINE void
-calc_output_multi_mt_3(const int use_cache, mt_cache_t *const cache, const int cache_len, const SINC_FILTER *const filter,
+calc_output_multi_mt_3(const enum MT_CACHE_MODE use_cache, mt_cache_t *const cache, const int cache_len, const SINC_FILTER *const filter,
 					   const increment_t increment, const increment_t start_filter_index, const int channels, const double scale, float *const output)
 {
 

From 1c01a777eee1501882fa343989010fc540d35501 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Sat, 18 Nov 2023 00:01:49 +0900
Subject: [PATCH 15/18] unnecessary (wrongly merged) cast removal

---
 src/src_sinc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index 9db3ace9..ba72029d 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -371,7 +371,7 @@ calc_output_multi_mt_core(const enum MT_CACHE_MODE use_cache, mt_cache_t *const
 				const int indx = fp_to_int(filter_index2);
 				assert(indx >= 0 && indx + 1 < filter->coeff_half_len + 2);
 				const coeff_t coeff_val = filter->coeffs[indx];
-				const coeff_t coeff_fraction = (float)filter->coeffs[indx + 1] - (float)filter->coeffs[indx];
+				const coeff_t coeff_fraction = filter->coeffs[indx + 1] - filter->coeffs[indx];
 				coeff = skip_fraction ? coeff_val : coeff_val + fraction * coeff_fraction;
 				assert(data_index2 >= 0 && data_index2 + channels - 1 < filter->b_len);
 				assert(data_index2 + channels - 1 < filter->b_end);

From 6d4abeb1839934458105afd7ad93ac0b03ca588d Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Sun, 19 Nov 2023 02:25:48 +0900
Subject: [PATCH 16/18] change _WIN32 to _MSC_VER

---
 src/src_sinc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index ba72029d..9b236bd1 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -219,7 +219,7 @@ sinc_get_description (int src_enum)
 
 #include <omp.h>
 
-#ifdef _WIN32
+#ifdef _MSC_VER
 	#define ALWAYS_INLINE __forceinline
 	#include <xmmintrin.h>
 

From c0aa8796a876a662c7791dde6a8b7f3beece9cb0 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Sat, 16 Dec 2023 19:01:20 +0900
Subject: [PATCH 17/18] loop termination condition fix

---
 src/src_sinc.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index 9b236bd1..f83576b9 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -661,8 +661,22 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no,
 		/* This is the termination condition. */
 		if (filter->b_real_end >= 0)
 		{
-			if (filter->b_current + input_index + terminate > filter->b_real_end)
+			#if 1 // perhaps this is the intended termination condition.
+			if (filter->b_current + input_index + terminate + channels - 1 > filter->b_real_end)
 				break;
+
+			#else // this one matches to the current (0.22) single-thread implementation, but it is seemingly a bug.
+			if (channels == 1)
+			{
+				if (filter->b_current + input_index + terminate > filter->b_real_end)
+					break;
+			}
+			else
+			{
+				if (filter->b_current + input_index + terminate >= filter->b_real_end)
+					break;
+			}
+			#endif
 		};
 
 		double scale, float_increment;

From d509240e6e992594719f8bb8e772dd2dae9c9ae4 Mon Sep 17 00:00:00 2001
From: ss3git <3226388001@jcom.home.ne.jp>
Date: Sat, 16 Dec 2023 22:21:25 +0900
Subject: [PATCH 18/18] undo loop termination condition fix, and make the
 decision equivalent to the single-thread version.

note: the (single-thread) implementation may have some underlying bug here because the number of output frames is seemingly
inconsistent depending on the combinations of src_ratio, input frames, and number of channels.
The fix should be out of scope for this PR (MultiThreading).
---
 src/src_sinc.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/src_sinc.c b/src/src_sinc.c
index f83576b9..9fee6fb0 100644
--- a/src/src_sinc.c
+++ b/src/src_sinc.c
@@ -661,11 +661,9 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no,
 		/* This is the termination condition. */
 		if (filter->b_real_end >= 0)
 		{
-			#if 1 // perhaps this is the intended termination condition.
-			if (filter->b_current + input_index + terminate + channels - 1 > filter->b_real_end)
-				break;
-
-			#else // this one matches to the current (0.22) single-thread implementation, but it is seemingly a bug.
+			// This switching is necessary to match the outputs to the current (0.22) single-thread implementation.
+			// However, the (single-thread) implementation may have some underlying bug because the number of output frames is seemingly 
+			// inconsistent depending on the combinations of src_ratio, input frames, and number of channels.
 			if (channels == 1)
 			{
 				if (filter->b_current + input_index + terminate > filter->b_real_end)
@@ -676,7 +674,6 @@ _sinc_multichan_vari_process_mt(const int num_of_threads, const int child_no,
 				if (filter->b_current + input_index + terminate >= filter->b_real_end)
 					break;
 			}
-			#endif
 		};
 
 		double scale, float_increment;