From 7fc4c0feef1cd29cc2aebdb82935e7154ddf204e Mon Sep 17 00:00:00 2001
From: Hansem Ro <hansemro@outlook.com>
Date: Thu, 17 Aug 2023 01:08:06 -0700
Subject: [PATCH 1/3] Add V2/V4 Siglent BIN import filter

This allows importing of analog/math/digital waveforms from V2/V4
Siglent BIN waveform formats.

Siglent scopes that export V2 BIN format:
- SDS2000X Plus with FW 1.2.6 to 1.3.9RX
- SDS5000X with FW 0.8.6 to 0.9.6
- SDS6000 with FW older than 1.4.1.0

Siglent scopes that export V4 BIN format:
- SDS2000X Plus with FW newer than 1.4.0
- SDS5000X with FW newer than 0.9.6
- SDS6000 with FW newer than 1.4.1.0
- SDS2000X HD

V2 and V4 spec details can be found on page 29 and 37 of E02A document,
respectively.

E02A: https://web.archive.org/web/20230730072643/https://www.siglenteu.com/wp-content/uploads/2021/08/How-to-Extract-Data-from-the-Binary-File.pdf
---
 scopeprotocols/CMakeLists.txt             |   1 +
 scopeprotocols/SiglentBINImportFilter.cpp | 281 ++++++++++++++++++++++
 scopeprotocols/SiglentBINImportFilter.h   | 103 ++++++++
 scopeprotocols/scopeprotocols.cpp         |   1 +
 scopeprotocols/scopeprotocols.h           |   1 +
 5 files changed, 387 insertions(+)
 create mode 100644 scopeprotocols/SiglentBINImportFilter.cpp
 create mode 100644 scopeprotocols/SiglentBINImportFilter.h

diff --git a/scopeprotocols/CMakeLists.txt b/scopeprotocols/CMakeLists.txt
index 8e6db270..5283b94f 100644
--- a/scopeprotocols/CMakeLists.txt
+++ b/scopeprotocols/CMakeLists.txt
@@ -131,6 +131,7 @@ set(SCOPEPROTOCOLS_SOURCES
 	SDCmdDecoder.cpp
 	SDDataDecoder.cpp
 	SDRAMDecoderBase.cpp
+	SiglentBINImportFilter.cpp
 	SNRFilter.cpp
 	SParameterCascadeFilter.cpp
 	SParameterDeEmbedFilter.cpp
diff --git a/scopeprotocols/SiglentBINImportFilter.cpp b/scopeprotocols/SiglentBINImportFilter.cpp
new file mode 100644
index 00000000..5a8ef536
--- /dev/null
+++ b/scopeprotocols/SiglentBINImportFilter.cpp
@@ -0,0 +1,281 @@
+/***********************************************************************************************************************
+*                                                                                                                      *
+* libscopeprotocols                                                                                                    *
+*                                                                                                                      *
+* Copyright (c) 2012-2023 Andrew D. Zonenberg and contributors                                                         *
+* All rights reserved.                                                                                                 *
+*                                                                                                                      *
+* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the     *
+* following conditions are met:                                                                                        *
+*                                                                                                                      *
+*    * Redistributions of source code must retain the above copyright notice, this list of conditions, and the         *
+*      following disclaimer.                                                                                           *
+*                                                                                                                      *
+*    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the       *
+*      following disclaimer in the documentation and/or other materials provided with the distribution.                *
+*                                                                                                                      *
+*    * Neither the name of the author nor the names of any contributors may be used to endorse or promote products     *
+*      derived from this software without specific prior written permission.                                           *
+*                                                                                                                      *
+* THIS SOFTWARE IS PROVIDED BY THE AUTHORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED   *
+* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL *
+* THE AUTHORS BE HELD LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES        *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR       *
+* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE       *
+* POSSIBILITY OF SUCH DAMAGE.                                                                                          *
+*                                                                                                                      *
+***********************************************************************************************************************/
+
+#include "../scopehal/scopehal.h"
+#include "SiglentBINImportFilter.h"
+
+using namespace std;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Construction / destruction
+
+SiglentBINImportFilter::SiglentBINImportFilter(const string& color)
+	: ImportFilter(color)
+{
+	m_fpname = "Siglent (V2/V4) BIN File";
+	m_parameters[m_fpname] = FilterParameter(FilterParameter::TYPE_FILENAME, Unit(Unit::UNIT_COUNTS));
+	m_parameters[m_fpname].m_fileFilterMask = "*.bin";
+	m_parameters[m_fpname].m_fileFilterName = "V2/V4 Siglent binary waveform files (*.bin)";
+	m_parameters[m_fpname].signal_changed().connect(sigc::mem_fun(*this, &SiglentBINImportFilter::OnFileNameChanged));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Accessors
+
+string SiglentBINImportFilter::GetProtocolName()
+{
+	return "Siglent (V2/V4) BIN Import";
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Actual decoder logic
+
+void SiglentBINImportFilter::OnFileNameChanged()
+{
+	//Wipe anything we may have had in the past
+	ClearStreams();
+
+	auto fname = m_parameters[m_fpname].ToString();
+	if(fname.empty())
+		return;
+
+	//Set waveform timestamp to file timestamp
+	time_t timestamp = 0;
+	int64_t fs = 0;
+	GetTimestampOfFile(fname, timestamp, fs);
+
+	string f = ReadFile(fname);
+	uint32_t fpos = 0;
+
+	FileHeader fh;
+	f.copy((char*)&fh, sizeof(FileHeader), fpos);
+	fpos += sizeof(FileHeader);
+
+	switch(fh.version)
+	{
+		case 2:
+			break;
+		case 4:
+			fpos += 4;
+			break;
+		default:
+			LogError("Unsupported version (%d) in file header\n", fh.version);
+			return;
+	}
+
+	LogDebug("Version: %d\n", fh.version);
+
+	//Parse waveform header
+	WaveHeader wh;
+	f.copy((char*)&wh, sizeof(WaveHeader), fpos);
+	fpos += sizeof(WaveHeader);
+
+	for(int i = 0; i < 4; i++)
+	{
+		LogDebug("ch%d_en: %d\n", i+1, wh.ch_en[i]);
+		LogDebug("ch%d_v_gain: %f\n", i+1, wh.ch_v_gain[i].value);
+		LogDebug("ch%d_v_offset: %f\n", i+1, wh.ch_v_offset[i].value);
+		LogDebug("ch%d_probe: %f\n", i+1, wh.ch_probe[i]);
+		LogDebug("ch%d_codes_per_div: %d\n", i+1, wh.ch_codes_per_div[i]);
+	}
+
+	LogDebug("digital_en: %d\n", wh.digital_en);
+	for(int i = 0; i < 16; i++)
+	{
+		LogDebug("d%d_ch_en: %d\n", i, wh.d_ch_en[i]);
+	}
+
+	LogDebug("time_div: %f\n", wh.time_div);
+	LogDebug("time_delay: %f\n", wh.time_delay);
+	LogDebug("wave_length: %d\n", wh.wave_length);
+	LogDebug("s_rate: %f\n", wh.s_rate);
+	LogDebug("d_wave_length: %d\n", wh.d_wave_length);
+	LogDebug("d_s_rate: %f\n", wh.d_s_rate);
+
+	LogDebug("data_width: %d\n", wh.data_width);
+	LogDebug("byte_order: %d\n", wh.byte_order);
+	LogDebug("num_hori_div: %d\n", wh.num_hori_div);
+
+	for(int i = 0; i < 4; i++)
+	{
+		LogDebug("math%d_en: %d\n", i+1, wh.math_en[i]);
+		LogDebug("math%d_v_gain: %f\n", i+1, wh.math_v_gain[i].value);
+		LogDebug("math%d_v_offset: %f\n", i+1, wh.math_v_offset[i].value);
+		LogDebug("math%d_wave_length: %d\n", i+1, wh.math_wave_length[i]);
+		LogDebug("math%d_s_interval: %f\n", i+1, wh.math_s_interval[i]);
+	}
+	LogDebug("math_codes_per_div: %d\n", wh.math_codes_per_div);
+
+	switch(fh.version)
+	{
+		case 2:
+			fpos = 0x800;
+			break;
+		case 4:
+			fpos = 0x1000;
+			break;
+		default:
+			LogError("Unsupported version (%d) in file header\n", fh.version);
+			return;
+	}
+
+	//Process analog data
+	uint32_t data_width = wh.data_width + 1; // number of bytes
+	int32_t center_code = (1 << (8*data_width - 1)) - 1;
+
+	uint32_t wave_idx = 0;
+	for(int i = 0; i < 4; i++)
+	{
+		if(wh.ch_en[i] == 1)
+		{
+			string name = string("C") + to_string(i+1);
+			AddStream(Unit(Unit::UNIT_VOLTS), name, Stream::STREAM_TYPE_ANALOG);
+			auto wfm = new UniformAnalogWaveform;
+			wfm->m_timescale = round(FS_PER_SECOND / wh.s_rate);
+			wfm->m_startTimestamp = timestamp * FS_PER_SECOND;
+			wfm->m_startFemtoseconds = fs;
+			wfm->m_triggerPhase = 0;
+			wfm->PrepareForCpuAccess();
+			SetData(wfm, m_streams.size() - 1);
+
+			LogDebug("Waveform[%d]: %s\n", wave_idx, name.c_str());
+			double v_gain = wh.ch_v_gain[i].value * wh.ch_probe[i] / wh.ch_codes_per_div[i];
+			LogDebug("\tv_gain: %f\n", v_gain);
+			LogDebug("\tcenter: %d\n", center_code);
+
+			if(data_width == 2)
+			{
+				for(size_t j = 0; j < wh.wave_length; j++)
+				{
+					const uint16_t* sample = reinterpret_cast<const uint16_t*>(f.c_str() + fpos);
+					float value = ((static_cast<int32_t>(*sample) - center_code)) * v_gain - wh.ch_v_offset[i].value;
+					wfm->m_samples.push_back(value);
+					fpos += 2;
+				}
+			}
+			else
+			{
+				for(size_t j = 0; j < wh.wave_length; j++)
+				{
+					const uint8_t* sample = reinterpret_cast<const uint8_t*>(f.c_str() + fpos);
+					float value = (static_cast<int32_t>(*sample) - center_code) * v_gain - wh.ch_v_offset[i].value;
+					wfm->m_samples.push_back(value);
+					fpos += 1;
+				}
+			}
+
+			wfm->MarkModifiedFromCpu();
+			wave_idx += 1;
+		}
+	}
+
+	//Process math data
+	for(int i = 0; i < 4; i++)
+	{
+		if(wh.math_en[i] == 1)
+		{
+			string name = string("F") + to_string(i+1);
+			AddStream(Unit(Unit::UNIT_VOLTS), name, Stream::STREAM_TYPE_ANALOG);
+			auto wfm = new UniformAnalogWaveform;
+			wfm->m_timescale = round(wh.math_s_interval[i] * FS_PER_SECOND);
+			wfm->m_startTimestamp = timestamp * FS_PER_SECOND;
+			wfm->m_startFemtoseconds = fs;
+			wfm->m_triggerPhase = 0;
+			wfm->PrepareForCpuAccess();
+			SetData(wfm, m_streams.size() - 1);
+
+			LogDebug("Waveform[%d]: %s\n", wave_idx, name.c_str());
+			double v_gain = wh.math_v_gain[i].value / wh.math_codes_per_div;
+			LogDebug("\tv_gain: %f\n", v_gain);
+			LogDebug("\tcenter: %d\n", center_code);
+
+			if(data_width == 2)
+			{
+				for(size_t j = 0; j < wh.math_wave_length[i]; j++)
+				{
+					const uint16_t* sample = reinterpret_cast<const uint16_t*>(f.c_str() + fpos);
+					float value = ((static_cast<int32_t>(*sample) - center_code)) * v_gain - wh.math_v_offset[i].value;
+					wfm->m_samples.push_back(value);
+					fpos += 2;
+				}
+			}
+			else
+			{
+				for(size_t j = 0; j < wh.math_wave_length[i]; j++)
+				{
+					const uint8_t* sample = reinterpret_cast<const uint8_t*>(f.c_str() + fpos);
+					float value = (static_cast<int32_t>(*sample) - center_code) * v_gain - wh.math_v_offset[i].value;
+					wfm->m_samples.push_back(value);
+					fpos += 1;
+				}
+			}
+
+			wfm->MarkModifiedFromCpu();
+			wave_idx += 1;
+		}
+	}
+
+	//Process digital data
+	if(wh.digital_en)
+	{
+		for(int i = 0; i < 16; i++)
+		{
+			if(wh.d_ch_en[i] == 1)
+			{
+				string name = string("D") + to_string(i);
+				AddStream(Unit(Unit::UNIT_VOLTS), name, Stream::STREAM_TYPE_DIGITAL);
+				auto wfm = new UniformDigitalWaveform;
+				wfm->m_timescale = round(FS_PER_SECOND / wh.d_s_rate);
+				wfm->m_startTimestamp = timestamp * FS_PER_SECOND;
+				wfm->m_startFemtoseconds = fs;
+				wfm->m_triggerPhase = 0;
+				wfm->PrepareForCpuAccess();
+				SetData(wfm, m_streams.size() - 1);
+
+				LogDebug("Waveform[%d]: %s\n", wave_idx, name.c_str());
+				for(size_t j = 0; j < (wh.d_wave_length / 8); j++)
+				{
+					uint8_t samples = *reinterpret_cast<const uint8_t*>(f.c_str() + fpos);
+					for(int k = 0; k < 8; k++)
+					{
+						bool value = samples & 0x1;
+						samples >>= 1;
+						wfm->m_samples.push_back(value);
+					}
+					fpos += 1;
+				}
+
+				wfm->MarkModifiedFromCpu();
+				wave_idx += 1;
+			}
+		}
+	}
+
+	m_outputsChangedSignal.emit();
+}
diff --git a/scopeprotocols/SiglentBINImportFilter.h b/scopeprotocols/SiglentBINImportFilter.h
new file mode 100644
index 00000000..03a8a9bf
--- /dev/null
+++ b/scopeprotocols/SiglentBINImportFilter.h
@@ -0,0 +1,103 @@
+/***********************************************************************************************************************
+*                                                                                                                      *
+* libscopeprotocols                                                                                                    *
+*                                                                                                                      *
+* Copyright (c) 2012-2023 Andrew D. Zonenberg and contributors                                                         *
+* All rights reserved.                                                                                                 *
+*                                                                                                                      *
+* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the     *
+* following conditions are met:                                                                                        *
+*                                                                                                                      *
+*    * Redistributions of source code must retain the above copyright notice, this list of conditions, and the         *
+*      following disclaimer.                                                                                           *
+*                                                                                                                      *
+*    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the       *
+*      following disclaimer in the documentation and/or other materials provided with the distribution.                *
+*                                                                                                                      *
+*    * Neither the name of the author nor the names of any contributors may be used to endorse or promote products     *
+*      derived from this software without specific prior written permission.                                           *
+*                                                                                                                      *
+* THIS SOFTWARE IS PROVIDED BY THE AUTHORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED   *
+* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL *
+* THE AUTHORS BE HELD LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES        *
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR       *
+* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE       *
+* POSSIBILITY OF SUCH DAMAGE.                                                                                          *
+*                                                                                                                      *
+***********************************************************************************************************************/
+
+/**
+	@file
+	@author Andrew D. Zonenberg
+	@brief Declaration of SiglentBINImportFilter
+ */
+#ifndef SiglentBINImportFilter_h
+#define SiglentBINImportFilter_h
+
+class SiglentBINImportFilter : public ImportFilter
+{
+public:
+	SiglentBINImportFilter(const std::string& color);
+
+	static std::string GetProtocolName();
+
+	PROTOCOL_DECODER_INITPROC(SiglentBINImportFilter)
+
+	//Siglent binary capture structs
+	#pragma pack(push, 1)
+	struct FileHeader
+	{
+		uint32_t version;			//File format version
+	};
+
+	// V2/V4 wave header
+	struct WaveHeader
+	{
+		int32_t ch_en[4];			//C1-C4 channel enable
+		struct {					//C1-C4 vertical gain
+			double value;
+			char reserved[32];
+		} ch_v_gain[4];
+		struct {					//C1-C4 vertical offset
+			double value;
+			char reserved[32];
+		} ch_v_offset[4];
+		int32_t digital_en;			//Digital enable
+		int32_t d_ch_en[16];		//D0-D15 channel enable
+		double time_div;			//Time base
+		char reserved9[32];
+		double time_delay;			//Trigger delay
+		char reserved10[32];
+		uint32_t wave_length;		//Number of samples in each analog waveform
+		double s_rate;				//C1-C4 sampling rate
+		char reserved11[32];
+		uint32_t d_wave_length;		//Number of samples in each digital waveform
+		double d_s_rate;			//D0-D15 sampling rate
+		char reserved12[32];
+		double ch_probe[4];			//C1-C4 probe factor
+		int8_t data_width;			//0:1 Byte, 1:2 Bytes
+		int8_t byte_order;			//0:LSB, 1:MSB
+		char reserved13[6];
+		int32_t num_hori_div;		//Number of horizontal divisions
+		int32_t ch_codes_per_div[4];//C1-C4 codes per division
+		int32_t math_en[4];			//F1-F4 channel enable
+		struct {					//F1-F4 vertical gain
+			double value;
+			char reserved[32];
+		} math_v_gain[4];
+		struct {					//F1-F2 vertical offset
+			double value;
+			char reserved[32];
+		} math_v_offset[4];
+		uint32_t math_wave_length[4];//F1-F4 number of samples
+		double math_s_interval[4];	//F1-F4 sampling interval
+		int32_t math_codes_per_div;	//F1-F4 codes per division
+	};
+	#pragma pack(pop)
+
+protected:
+	void OnFileNameChanged();
+};
+
+#endif
diff --git a/scopeprotocols/scopeprotocols.cpp b/scopeprotocols/scopeprotocols.cpp
index daed916b..642d94d5 100644
--- a/scopeprotocols/scopeprotocols.cpp
+++ b/scopeprotocols/scopeprotocols.cpp
@@ -165,6 +165,7 @@ void ScopeProtocolStaticInit()
 	AddDecoderClass(ScaleFilter);
 	AddDecoderClass(SDCmdDecoder);
 	AddDecoderClass(SDDataDecoder);
+	AddDecoderClass(SiglentBINImportFilter);
 	AddDecoderClass(SNRFilter);
 	AddDecoderClass(SParameterCascadeFilter);
 	AddDecoderClass(SParameterDeEmbedFilter);
diff --git a/scopeprotocols/scopeprotocols.h b/scopeprotocols/scopeprotocols.h
index 228bacb5..76efccc2 100644
--- a/scopeprotocols/scopeprotocols.h
+++ b/scopeprotocols/scopeprotocols.h
@@ -165,6 +165,7 @@
 #include "ScaleFilter.h"
 #include "SDCmdDecoder.h"
 #include "SDDataDecoder.h"
+#include "SiglentBINImportFilter.h"
 #include "SNRFilter.h"
 #include "SParameterCascadeFilter.h"
 #include "SParameterDeEmbedFilter.h"

From 4cbd793bac82234743cb55a43007be0dbc95677a Mon Sep 17 00:00:00 2001
From: Hansem Ro <hansemro@outlook.com>
Date: Tue, 22 Aug 2023 16:51:50 -0700
Subject: [PATCH 2/3] SiglentBINImportFilter: AVX2 + FMA implementation for
 analog/math channels

This also defines Oscilloscope::ConvertUnsigned16BitSamples with
generic, AVX2, and FMA implementations.
---
 scopehal/Oscilloscope.cpp                 | 218 ++++++++++++++++++++++
 scopehal/Oscilloscope.h                   |   7 +
 scopeprotocols/SiglentBINImportFilter.cpp |  58 +++---
 3 files changed, 255 insertions(+), 28 deletions(-)

diff --git a/scopehal/Oscilloscope.cpp b/scopehal/Oscilloscope.cpp
index 877825a0..4c94faf0 100644
--- a/scopehal/Oscilloscope.cpp
+++ b/scopehal/Oscilloscope.cpp
@@ -1277,3 +1277,221 @@ void Oscilloscope::Convert16BitSamplesAVX512F(float* pout, int16_t* pin, float g
 }
 #endif /* __x86_64__ */
 
+/**
+	@brief Converts Unsigned 16-bit ADC samples to floating point
+ */
+void Oscilloscope::ConvertUnsigned16BitSamples(float* pout, uint16_t* pin, float gain, float offset, size_t count)
+{
+	//Divide large waveforms (>1M points) into blocks and multithread them
+	//TODO: tune split
+	if(count > 1000000)
+	{
+		//Round blocks to multiples of 64 samples for clean vectorization
+		size_t numblocks = omp_get_max_threads();
+		size_t lastblock = numblocks - 1;
+		size_t blocksize = count / numblocks;
+		blocksize = blocksize - (blocksize % 64);
+
+		#pragma omp parallel for
+		for(size_t i=0; i<numblocks; i++)
+		{
+			//Last block gets any extra that didn't divide evenly
+			size_t nsamp = blocksize;
+			if(i == lastblock)
+				nsamp = count - i*blocksize;
+
+			size_t off = i*blocksize;
+			#ifdef __x86_64__
+			if(g_hasAvx2)
+			{
+				if(g_hasFMA)
+				{
+					ConvertUnsigned16BitSamplesFMA(
+						pout + off,
+						pin + off,
+						gain,
+						offset,
+						nsamp);
+				}
+				else
+				{
+					ConvertUnsigned16BitSamplesAVX2(
+						pout + off,
+						pin + off,
+						gain,
+						offset,
+						nsamp);
+				}
+			}
+			else
+			#endif /* __x86_64__ */
+			{
+				ConvertUnsigned16BitSamplesGeneric(
+					pout + off,
+					pin + off,
+					gain,
+					offset,
+					nsamp);
+			}
+		}
+	}
+
+	//Small waveforms get done single threaded to avoid overhead
+	else
+	{
+		#ifdef __x86_64__
+		if(g_hasAvx2)
+		{
+			if(g_hasFMA)
+				ConvertUnsigned16BitSamplesFMA(pout, pin, gain, offset, count);
+			else
+				ConvertUnsigned16BitSamplesAVX2(pout, pin, gain, offset, count);
+		}
+		else
+		#endif /* __x86_64__ */
+			ConvertUnsigned16BitSamplesGeneric(pout, pin, gain, offset, count);
+	}
+}
+
+/**
+	@brief Generic backend for ConvertUnsigned16BitSamples()
+ */
+void Oscilloscope::ConvertUnsigned16BitSamplesGeneric(float* pout, uint16_t* pin, float gain, float offset, size_t count)
+{
+	for(unsigned int k=0; k<count; k++)
+		pout[k] = pin[k] * gain - offset;
+}
+
+#ifdef __x86_64__
+__attribute__((target("avx2")))
+void Oscilloscope::ConvertUnsigned16BitSamplesAVX2(float* pout, uint16_t* pin, float gain, float offset, size_t count)
+{
+	size_t end = count - (count % 32);
+
+	__m256 gains = { gain, gain, gain, gain, gain, gain, gain, gain };
+	__m256 offsets = { offset, offset, offset, offset, offset, offset, offset, offset };
+
+	for(size_t k=0; k<end; k += 32)
+	{
+		//Load all 32 raw ADC samples, without assuming alignment
+		//(on most modern Intel processors, load and loadu have same latency/throughput)
+		__m256i raw_samples1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(pin + k));
+		__m256i raw_samples2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(pin + k + 16));
+
+		//Extract the low and high halves (8 samples each) from the input blocks
+		__m128i block0_u16 = _mm256_extracti128_si256(raw_samples1, 0);
+		__m128i block1_u16 = _mm256_extracti128_si256(raw_samples1, 1);
+		__m128i block2_u16 = _mm256_extracti128_si256(raw_samples2, 0);
+		__m128i block3_u16 = _mm256_extracti128_si256(raw_samples2, 1);
+
+		//Convert both blocks from unsigned 16-bit to signed 32-bit, giving us a pair of 8x int32 vectors
+		__m256i block0_i32 = _mm256_cvtepu16_epi32(block0_u16);
+		__m256i block1_i32 = _mm256_cvtepu16_epi32(block1_u16);
+		__m256i block2_i32 = _mm256_cvtepu16_epi32(block2_u16);
+		__m256i block3_i32 = _mm256_cvtepu16_epi32(block3_u16);
+
+		//Convert the 32-bit int blocks to fp32
+		//Sadly there's no direct epi16 to ps conversion instruction.
+		__m256 block0_float = _mm256_cvtepi32_ps(block0_i32);
+		__m256 block1_float = _mm256_cvtepi32_ps(block1_i32);
+		__m256 block2_float = _mm256_cvtepi32_ps(block2_i32);
+		__m256 block3_float = _mm256_cvtepi32_ps(block3_i32);
+
+		//Woo! We've finally got floating point data. Now we can do the fun part.
+		block0_float = _mm256_mul_ps(block0_float, gains);
+		block1_float = _mm256_mul_ps(block1_float, gains);
+		block2_float = _mm256_mul_ps(block2_float, gains);
+		block3_float = _mm256_mul_ps(block3_float, gains);
+
+		block0_float = _mm256_sub_ps(block0_float, offsets);
+		block1_float = _mm256_sub_ps(block1_float, offsets);
+		block2_float = _mm256_sub_ps(block2_float, offsets);
+		block3_float = _mm256_sub_ps(block3_float, offsets);
+
+		//All done, store back to the output buffer
+		_mm256_store_ps(pout + k, 		block0_float);
+		_mm256_store_ps(pout + k + 8,	block1_float);
+		_mm256_store_ps(pout + k + 16,	block2_float);
+		_mm256_store_ps(pout + k + 24,	block3_float);
+	}
+
+	//Get any extras we didn't get in the SIMD loop
+	for(size_t k=end; k<count; k++)
+		pout[k] = pin[k] * gain - offset;
+}
+
+__attribute__((target("avx2,fma")))
+void Oscilloscope::ConvertUnsigned16BitSamplesFMA(float* pout, uint16_t* pin, float gain, float offset, size_t count)
+{
+	size_t end = count - (count % 64);
+
+	__m256 gains = { gain, gain, gain, gain, gain, gain, gain, gain };
+	__m256 offsets = { offset, offset, offset, offset, offset, offset, offset, offset };
+
+	for(size_t k=0; k<end; k += 64)
+	{
+		//Load all 64 raw ADC samples, without assuming alignment
+		//(on most modern Intel processors, load and loadu have same latency/throughput)
+		__m256i raw_samples1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(pin + k));
+		__m256i raw_samples2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(pin + k + 16));
+		__m256i raw_samples3 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(pin + k + 32));
+		__m256i raw_samples4 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(pin + k + 48));
+
+		//Extract the low and high halves (8 samples each) from the input blocks
+		__m128i block0_u16 = _mm256_extracti128_si256(raw_samples1, 0);
+		__m128i block1_u16 = _mm256_extracti128_si256(raw_samples1, 1);
+		__m128i block2_u16 = _mm256_extracti128_si256(raw_samples2, 0);
+		__m128i block3_u16 = _mm256_extracti128_si256(raw_samples2, 1);
+		__m128i block4_u16 = _mm256_extracti128_si256(raw_samples3, 0);
+		__m128i block5_u16 = _mm256_extracti128_si256(raw_samples3, 1);
+		__m128i block6_u16 = _mm256_extracti128_si256(raw_samples4, 0);
+		__m128i block7_u16 = _mm256_extracti128_si256(raw_samples4, 1);
+
+		//Convert the blocks from unsigned 16-bit to signed 32-bit, giving us a pair of 8x int32 vectors
+		__m256i block0_i32 = _mm256_cvtepu16_epi32(block0_u16);
+		__m256i block1_i32 = _mm256_cvtepu16_epi32(block1_u16);
+		__m256i block2_i32 = _mm256_cvtepu16_epi32(block2_u16);
+		__m256i block3_i32 = _mm256_cvtepu16_epi32(block3_u16);
+		__m256i block4_i32 = _mm256_cvtepu16_epi32(block4_u16);
+		__m256i block5_i32 = _mm256_cvtepu16_epi32(block5_u16);
+		__m256i block6_i32 = _mm256_cvtepu16_epi32(block6_u16);
+		__m256i block7_i32 = _mm256_cvtepu16_epi32(block7_u16);
+
+		//Convert the 32-bit int blocks to fp32
+		//Sadly there's no direct epi16 to ps conversion instruction.
+		__m256 block0_float = _mm256_cvtepi32_ps(block0_i32);
+		__m256 block1_float = _mm256_cvtepi32_ps(block1_i32);
+		__m256 block2_float = _mm256_cvtepi32_ps(block2_i32);
+		__m256 block3_float = _mm256_cvtepi32_ps(block3_i32);
+		__m256 block4_float = _mm256_cvtepi32_ps(block4_i32);
+		__m256 block5_float = _mm256_cvtepi32_ps(block5_i32);
+		__m256 block6_float = _mm256_cvtepi32_ps(block6_i32);
+		__m256 block7_float = _mm256_cvtepi32_ps(block7_i32);
+
+		//Woo! We've finally got floating point data. Now we can do the fun part.
+		block0_float = _mm256_fmsub_ps(block0_float, gains, offsets);
+		block1_float = _mm256_fmsub_ps(block1_float, gains, offsets);
+		block2_float = _mm256_fmsub_ps(block2_float, gains, offsets);
+		block3_float = _mm256_fmsub_ps(block3_float, gains, offsets);
+		block4_float = _mm256_fmsub_ps(block4_float, gains, offsets);
+		block5_float = _mm256_fmsub_ps(block5_float, gains, offsets);
+		block6_float = _mm256_fmsub_ps(block6_float, gains, offsets);
+		block7_float = _mm256_fmsub_ps(block7_float, gains, offsets);
+
+		//All done, store back to the output buffer
+		_mm256_store_ps(pout + k, 		block0_float);
+		_mm256_store_ps(pout + k + 8,	block1_float);
+		_mm256_store_ps(pout + k + 16,	block2_float);
+		_mm256_store_ps(pout + k + 24,	block3_float);
+
+		_mm256_store_ps(pout + k + 32,	block4_float);
+		_mm256_store_ps(pout + k + 40,	block5_float);
+		_mm256_store_ps(pout + k + 48,	block6_float);
+		_mm256_store_ps(pout + k + 56,	block7_float);
+	}
+
+	//Get any extras we didn't get in the SIMD loop
+	for(size_t k=end; k<count; k++)
+		pout[k] = pin[k] * gain - offset;
+}
+#endif /* __x86_64__ */
diff --git a/scopehal/Oscilloscope.h b/scopehal/Oscilloscope.h
index 4cf942ad..11da80f1 100644
--- a/scopehal/Oscilloscope.h
+++ b/scopehal/Oscilloscope.h
@@ -799,6 +799,13 @@ class Oscilloscope : public virtual Instrument
 	static void Convert16BitSamplesAVX512F(float* pout, int16_t* pin, float gain, float offset, size_t count);
 #endif
 
+	static void ConvertUnsigned16BitSamples(float* pout, uint16_t* pin, float gain, float offset, size_t count);
+	static void ConvertUnsigned16BitSamplesGeneric(float* pout, uint16_t* pin, float gain, float offset, size_t count);
+#ifdef __x86_64__
+	static void ConvertUnsigned16BitSamplesAVX2(float* pout, uint16_t* pin, float gain, float offset, size_t count);
+	static void ConvertUnsigned16BitSamplesFMA(float* pout, uint16_t* pin, float gain, float offset, size_t count);
+#endif
+
 public:
 	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 	// Waveform Access
diff --git a/scopeprotocols/SiglentBINImportFilter.cpp b/scopeprotocols/SiglentBINImportFilter.cpp
index 5a8ef536..165139db 100644
--- a/scopeprotocols/SiglentBINImportFilter.cpp
+++ b/scopeprotocols/SiglentBINImportFilter.cpp
@@ -163,6 +163,7 @@ void SiglentBINImportFilter::OnFileNameChanged()
 			wfm->m_triggerPhase = 0;
 			wfm->PrepareForCpuAccess();
 			SetData(wfm, m_streams.size() - 1);
+			wfm->Resize(wh.wave_length);
 
 			LogDebug("Waveform[%d]: %s\n", wave_idx, name.c_str());
 			double v_gain = wh.ch_v_gain[i].value * wh.ch_probe[i] / wh.ch_codes_per_div[i];
@@ -171,23 +172,23 @@ void SiglentBINImportFilter::OnFileNameChanged()
 
 			if(data_width == 2)
 			{
-				for(size_t j = 0; j < wh.wave_length; j++)
-				{
-					const uint16_t* sample = reinterpret_cast<const uint16_t*>(f.c_str() + fpos);
-					float value = ((static_cast<int32_t>(*sample) - center_code)) * v_gain - wh.ch_v_offset[i].value;
-					wfm->m_samples.push_back(value);
-					fpos += 2;
-				}
+				Oscilloscope::ConvertUnsigned16BitSamples(
+					wfm->m_samples.GetCpuPointer(),
+					(uint16_t*)(f.c_str() + fpos),
+					v_gain,
+					v_gain * center_code + wh.ch_v_offset[i].value,
+					wh.wave_length);
+				fpos += 2 * wh.wave_length;
 			}
 			else
 			{
-				for(size_t j = 0; j < wh.wave_length; j++)
-				{
-					const uint8_t* sample = reinterpret_cast<const uint8_t*>(f.c_str() + fpos);
-					float value = (static_cast<int32_t>(*sample) - center_code) * v_gain - wh.ch_v_offset[i].value;
-					wfm->m_samples.push_back(value);
-					fpos += 1;
-				}
+				Oscilloscope::ConvertUnsigned8BitSamples(
+					wfm->m_samples.GetCpuPointer(),
+					(uint8_t*)(f.c_str() + fpos),
+					v_gain,
+					v_gain * center_code + wh.ch_v_offset[i].value,
+					wh.wave_length);
+				fpos += wh.wave_length;
 			}
 
 			wfm->MarkModifiedFromCpu();
@@ -209,6 +210,7 @@ void SiglentBINImportFilter::OnFileNameChanged()
 			wfm->m_triggerPhase = 0;
 			wfm->PrepareForCpuAccess();
 			SetData(wfm, m_streams.size() - 1);
+			wfm->Resize(wh.math_wave_length[i]);
 
 			LogDebug("Waveform[%d]: %s\n", wave_idx, name.c_str());
 			double v_gain = wh.math_v_gain[i].value / wh.math_codes_per_div;
@@ -217,23 +219,23 @@ void SiglentBINImportFilter::OnFileNameChanged()
 
 			if(data_width == 2)
 			{
-				for(size_t j = 0; j < wh.math_wave_length[i]; j++)
-				{
-					const uint16_t* sample = reinterpret_cast<const uint16_t*>(f.c_str() + fpos);
-					float value = ((static_cast<int32_t>(*sample) - center_code)) * v_gain - wh.math_v_offset[i].value;
-					wfm->m_samples.push_back(value);
-					fpos += 2;
-				}
+				Oscilloscope::ConvertUnsigned16BitSamples(
+					wfm->m_samples.GetCpuPointer(),
+					(uint16_t*)(f.c_str() + fpos),
+					v_gain,
+					v_gain * center_code + wh.math_v_offset[i].value,
+					wh.math_wave_length[i]);
+				fpos += 2 * wh.math_wave_length[i];
 			}
 			else
 			{
-				for(size_t j = 0; j < wh.math_wave_length[i]; j++)
-				{
-					const uint8_t* sample = reinterpret_cast<const uint8_t*>(f.c_str() + fpos);
-					float value = (static_cast<int32_t>(*sample) - center_code) * v_gain - wh.math_v_offset[i].value;
-					wfm->m_samples.push_back(value);
-					fpos += 1;
-				}
+				Oscilloscope::ConvertUnsigned8BitSamples(
+					wfm->m_samples.GetCpuPointer(),
+					(uint8_t*)(f.c_str() + fpos),
+					v_gain,
+					v_gain * center_code + wh.math_v_offset[i].value,
+					wh.math_wave_length[i]);
+				fpos += wh.math_wave_length[i];
 			}
 
 			wfm->MarkModifiedFromCpu();

From 4bfaf83c696304d6a9166045060fe731df11e636 Mon Sep 17 00:00:00 2001
From: Hansem Ro <hansemro@outlook.com>
Date: Tue, 22 Aug 2023 19:15:45 -0700
Subject: [PATCH 3/3] Add AVX512 implementation of ConvertUnsigned16BitSamples

---
 scopehal/Oscilloscope.cpp | 65 +++++++++++++++++++++++++++++++++++++--
 scopehal/Oscilloscope.h   |  1 +
 2 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/scopehal/Oscilloscope.cpp b/scopehal/Oscilloscope.cpp
index 4c94faf0..0268df0b 100644
--- a/scopehal/Oscilloscope.cpp
+++ b/scopehal/Oscilloscope.cpp
@@ -1302,7 +1302,16 @@ void Oscilloscope::ConvertUnsigned16BitSamples(float* pout, uint16_t* pin, float
 
 			size_t off = i*blocksize;
 			#ifdef __x86_64__
-			if(g_hasAvx2)
+			if(g_hasAvx512F)
+			{
+				ConvertUnsigned16BitSamplesAVX512F(
+					pout + off,
+					pin + off,
+					gain,
+					offset,
+					nsamp);
+			}
+			else if(g_hasAvx2)
 			{
 				if(g_hasFMA)
 				{
@@ -1384,7 +1393,7 @@ void Oscilloscope::ConvertUnsigned16BitSamplesAVX2(float* pout, uint16_t* pin, f
 		__m128i block2_u16 = _mm256_extracti128_si256(raw_samples2, 0);
 		__m128i block3_u16 = _mm256_extracti128_si256(raw_samples2, 1);
 
-		//Convert both blocks from unsigned 16-bit to signed 32-bit, giving us a pair of 8x int32 vectors
+		//Convert the blocks from unsigned 16-bit to signed 32-bit, giving us a pair of 8x int32 vectors
 		__m256i block0_i32 = _mm256_cvtepu16_epi32(block0_u16);
 		__m256i block1_i32 = _mm256_cvtepu16_epi32(block1_u16);
 		__m256i block2_i32 = _mm256_cvtepu16_epi32(block2_u16);
@@ -1494,4 +1503,56 @@ void Oscilloscope::ConvertUnsigned16BitSamplesFMA(float* pout, uint16_t* pin, fl
 	for(size_t k=end; k<count; k++)
 		pout[k] = pin[k] * gain - offset;
 }
+
+__attribute__((target("avx512f")))
+void Oscilloscope::ConvertUnsigned16BitSamplesAVX512F(float* pout, uint16_t* pin, float gain, float offset, size_t count)
+{
+	size_t end = count - (count % 64);
+
+	__m512 gains = _mm512_set1_ps(gain);
+	__m512 offsets = _mm512_set1_ps(offset);
+
+	for(size_t k=0; k<end; k += 64)
+	{
+		//Load all 64 raw ADC samples, without assuming alignment
+		//(on most modern Intel processors, load and loadu have same latency/throughput)
+		__m512i raw_samples1 = _mm512_loadu_si512(reinterpret_cast<__m512i*>(pin + k));
+		__m512i raw_samples2 = _mm512_loadu_si512(reinterpret_cast<__m512i*>(pin + k + 32));
+
+		//Extract the high and low halves (16 samples each) from the input blocks
+		__m256i block0_u16 = _mm512_extracti64x4_epi64(raw_samples1, 0);
+		__m256i block1_u16 = _mm512_extracti64x4_epi64(raw_samples1, 1);
+		__m256i block2_u16 = _mm512_extracti64x4_epi64(raw_samples2, 0);
+		__m256i block3_u16 = _mm512_extracti64x4_epi64(raw_samples2, 1);
+
+		//Convert the blocks from unsigned 16-bit to signed 32-bit, giving us a pair of 16x int32 vectors
+		__m512i block0_i32 = _mm512_cvtepu16_epi32(block0_u16);
+		__m512i block1_i32 = _mm512_cvtepu16_epi32(block1_u16);
+		__m512i block2_i32 = _mm512_cvtepu16_epi32(block2_u16);
+		__m512i block3_i32 = _mm512_cvtepu16_epi32(block3_u16);
+
+		//Convert the 32-bit int blocks to fp32
+		//Sadly there's no direct epi16 to ps conversion instruction.
+		__m512 block0_float = _mm512_cvtepi32_ps(block0_i32);
+		__m512 block1_float = _mm512_cvtepi32_ps(block1_i32);
+		__m512 block2_float = _mm512_cvtepi32_ps(block2_i32);
+		__m512 block3_float = _mm512_cvtepi32_ps(block3_i32);
+
+		//Woo! We've finally got floating point data. Now we can do the fun part.
+		block0_float = _mm512_fmsub_ps(block0_float, gains, offsets);
+		block1_float = _mm512_fmsub_ps(block1_float, gains, offsets);
+		block2_float = _mm512_fmsub_ps(block2_float, gains, offsets);
+		block3_float = _mm512_fmsub_ps(block3_float, gains, offsets);
+
+		//All done, store back to the output buffer
+		_mm512_store_ps(pout + k, 		block0_float);
+		_mm512_store_ps(pout + k + 16,	block1_float);
+		_mm512_store_ps(pout + k + 32,	block2_float);
+		_mm512_store_ps(pout + k + 48,	block3_float);
+	}
+
+	//Get any extras we didn't get in the SIMD loop
+	for(size_t k=end; k<count; k++)
+		pout[k] = pin[k] * gain - offset;
+}
 #endif /* __x86_64__ */
diff --git a/scopehal/Oscilloscope.h b/scopehal/Oscilloscope.h
index 11da80f1..7eba5189 100644
--- a/scopehal/Oscilloscope.h
+++ b/scopehal/Oscilloscope.h
@@ -804,6 +804,7 @@ class Oscilloscope : public virtual Instrument
 #ifdef __x86_64__
 	static void ConvertUnsigned16BitSamplesAVX2(float* pout, uint16_t* pin, float gain, float offset, size_t count);
 	static void ConvertUnsigned16BitSamplesFMA(float* pout, uint16_t* pin, float gain, float offset, size_t count);
+	static void ConvertUnsigned16BitSamplesAVX512F(float* pout, uint16_t* pin, float gain, float offset, size_t count);
 #endif
 
 public: