Apra-Labs · mraduldubey · Feb 28, 2024 · Dec 22, 2023 · Dec 22, 2023 · Dec 22, 2023
diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt
@@ -1,11 +1,13 @@
 cmake_minimum_required(VERSION 3.22)
 
-OPTION(ENABLE_LINUX "Use this switch to enable LINUX" ON)
+OPTION(ENABLE_LINUX "Use this switch to enable LINUX" OFF)
 OPTION(ENABLE_CUDA "Use this switch to enable CUDA" ON)
 OPTION(ENABLE_ARM64 "Use this switch to enable ARM64" OFF)
-OPTION(ENABLE_WINDOWS "Use this switch to enable WINDOWS" OFF)
+OPTION(ENABLE_WINDOWS "Use this switch to enable WINDOWS" ON)
 
 set(VCPKG_INSTALL_OPTIONS "--clean-after-build")
+set(VCPKG_OVERLAY_PORTS "${CMAKE_CURRENT_SOURCE_DIR}/../thirdparty/custom-overlay")
+
 IF(ENABLE_CUDA)
 	add_compile_definitions(APRA_CUDA_ENABLED)
 ENDIF(ENABLE_CUDA)
@@ -38,8 +40,6 @@ project(APRAPIPES)
 message(STATUS $ENV{PKG_CONFIG_PATH}">>>>>> PKG_CONFIG_PATH")
 
 find_package(PkgConfig REQUIRED)
-
-
 find_package(Boost COMPONENTS system thread filesystem serialization log chrono unit_test_framework REQUIRED)
 find_package(JPEG REQUIRED)
 find_package(OpenCV CONFIG REQUIRED)
@@ -50,6 +50,7 @@ find_package(FFMPEG REQUIRED)
 find_package(ZXing CONFIG REQUIRED)
 find_package(bigint CONFIG REQUIRED)
 find_package(SFML COMPONENTS system window audio graphics CONFIG REQUIRED)
+find_package(whisper CONFIG REQUIRED)
 
 IF(ENABLE_CUDA)
 	if((NOT DEFINED CMAKE_CUDA_ARCHITECTURES) OR (CMAKE_CUDA_ARCHITECTURES STREQUAL ""))
@@ -280,10 +281,9 @@ SET(IP_FILES
 	src/OverlayFactory.h
 	src/OverlayFactory.cpp
 	src/TestSignalGeneratorSrc.cpp
+	src/AudioToTextXForm.cpp 
 )
-
-
-
+
 SET(IP_FILES_H
 	include/HistogramOverlay.h
 	include/CalcHistogramCV.h
@@ -306,10 +306,9 @@ SET(IP_FILES_H
 	include/TextOverlayXForm.h
 	include/ColorConversionXForm.h
 	include/Overlay.h
+	include/AudioToTextXForm.h
 )
 
-
-
 SET(CUDA_CORE_FILES
 	src/apra_cudamalloc_allocator.cu
 	src/apra_cudamallochost_allocator.cu
@@ -561,6 +560,7 @@ SET(UT_FILES
 	test/mp4_dts_strategy_tests.cpp
 	test/overlaymodule_tests.cpp
 	test/testSignalGeneratorSrc_tests.cpp
+	test/audioToTextXform_tests.cpp
 	${ARM64_UT_FILES}
 	${CUDA_UT_FILES}
 )
@@ -607,6 +607,7 @@ target_link_libraries(aprapipesut
   liblzma::liblzma
   bigint::bigint
   sfml-audio
+  whisper::whisper
   )
 
 IF(ENABLE_WINDOWS)

diff --git a/base/include/AudioToTextXForm.h b/base/include/AudioToTextXForm.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include "Module.h"
+
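+// The amount of audio to transcribe per run is configurable through AudioToTextXFormProps::bufferSize.
+// Incoming samples are cached by the module until that many are available and are then processed together.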
+
+/**
+ * Configuration for AudioToTextXForm.
+ *
+ * samplingStrategy - decoder sampling strategy that is mapped onto the whisper enum in init()
+ * modelPath        - path of the whisper model file loaded in init()
+ * bufferSize       - number of audio samples to accumulate before a transcription run
+ */
+class AudioToTextXFormProps : public ModuleProps
+{
+public:
+	enum DecoderSamplingStrategy
+	{
+		GREEDY,     // WHISPER_SAMPLING_GREEDY
+		BEAM_SEARCH // WHISPER_SAMPLING_BEAM_SEARCH
+	};
+
+	AudioToTextXFormProps(DecoderSamplingStrategy _samplingStrategy, std::string _modelPath, int _bufferSize)
+		: samplingStrategy(_samplingStrategy), modelPath(_modelPath), bufferSize(_bufferSize)
+	{
+	}
+
+	DecoderSamplingStrategy samplingStrategy;
+	std::string modelPath;
+	int bufferSize;
+};
+
+// Transform module that accumulates incoming audio frames and emits the
+// whisper transcription of the accumulated audio as a TEXT frame.
+class AudioToTextXForm  : public Module
+{
+
+public:
+	AudioToTextXForm(AudioToTextXFormProps _props);
+	virtual ~AudioToTextXForm();
+	// Builds the whisper parameters and loads the model named by the props.
+	bool init();
+	// Releases the whisper resources acquired in init().
+	bool term();
+	// Queues a props change; it is applied later through handlePropsChange().
+	void setProps(AudioToTextXFormProps& props);
+	// Returns a copy of the currently cached props.
+	AudioToTextXFormProps getProps();
+
+protected:
+	// Appends the frame's 16-bit samples (scaled to float) to the internal buffer
+	// and, once bufferSize samples are available, transcribes and sends the text.
+	bool process(frame_container& frames);
+	bool processSOS(frame_sp& frame);
+	// Requires exactly one input pin carrying FrameMetadata::AUDIO.
+	bool validateInputPins();
+	// Requires exactly one output pin carrying FrameMetadata::TEXT.
+	bool validateOutputPins();
+	// Registers the input pin and creates the matching TEXT output pin.
+	void addInputPin(framemetadata_sp& metadata, string& pinId);
+	bool handlePropsChange(frame_sp& frame);
+
+private:
+	void setMetadata(framemetadata_sp& metadata);
+	// Implementation state is kept out of the header (defined in the .cpp).
+	class Detail;
+	boost::shared_ptr<Detail> mDetail;
+};
diff --git a/base/include/FrameMetadata.h b/base/include/FrameMetadata.h
@@ -50,7 +50,8 @@ class FrameMetadata {
 		HEVC_DATA, //H265
 		MOTION_VECTOR_DATA,
 		OVERLAY_INFO_IMAGE,
-		FACE_LANDMARKS_INFO
+		FACE_LANDMARKS_INFO,
+		TEXT
 	};
 
 	enum MemType

diff --git a/base/include/Mp4WriterSinkUtils.h b/base/include/Mp4WriterSinkUtils.h
@@ -1,4 +1,7 @@
 #include <ctime>
+#include <chrono>
+#include <string>
+#include <boost/filesystem.hpp>
 
 class Mp4WriterSinkUtils
 {

diff --git a/base/src/AudioToTextXForm.cpp b/base/src/AudioToTextXForm.cpp
@@ -0,0 +1,180 @@
+#include "AudioToTextXForm.h"
+#include "FrameMetadata.h"
+#include "FrameMetadataFactory.h"
+#include "Frame.h"
+#include "Logger.h"
+#include "Utils.h"
+#include "whisper.h"
+#include "SFML/Config.hpp"
+
+// Private implementation state of AudioToTextXForm: the cached props, output
+// pin bookkeeping, the audio samples awaiting transcription and the whisper
+// context together with its parameter structs.
+class AudioToTextXForm::Detail
+{
+public:
+	Detail(AudioToTextXFormProps& _props) : mProps(_props)
+	{
+	}
+	~Detail() {}
+
+	// Replaces the cached props with a copy of `props`.
+	void setProps(AudioToTextXFormProps& props)
+	{
+		mProps = props;
+	}
+
+public:
+	framemetadata_sp mOutputMetadata;
+	std::string mOutputPinId;
+	// Float samples collected from incoming frames until a transcription run.
+	std::vector<float> inputAudioBuffer;
+	AudioToTextXFormProps mProps;
+	// Zero-initialised so the member never holds an indeterminate value.
+	int mFrameType = 0;
+	// Null until init() loads the model.
+	whisper_context *mWhisperContext = nullptr;
+	// Value-initialised here; init() overwrites both with the whisper defaults.
+	whisper_full_params mWhisperFullParams{};
+	whisper_context_params mWhisperContextParams{};
+};
+
+// Registers the module as a TRANSFORM and creates its private state from the props.
+AudioToTextXForm::AudioToTextXForm(AudioToTextXFormProps _props)
+	: Module(TRANSFORM, "AudioToTextXForm", _props)
+{
+	mDetail = boost::shared_ptr<Detail>(new Detail(_props));
+}
+
+// Nothing to release here; the whisper resources are freed in term().
+AudioToTextXForm::~AudioToTextXForm()
+{
+}
+
+// Accepts the module only when it has exactly one input pin and that pin
+// carries FrameMetadata::AUDIO; logs the reason and returns false otherwise.
+bool AudioToTextXForm::validateInputPins()
+{
+	const auto inputPinCount = getNumberOfInputPins();
+	if (inputPinCount != 1)
+	{
+		LOG_ERROR << "<" << getId() << ">::validateInputPins size is expected to be 1. Actual<" << inputPinCount << ">";
+		return false;
+	}
+
+	const FrameMetadata::FrameType inputType = getFirstInputMetadata()->getFrameType();
+	if (inputType == FrameMetadata::AUDIO)
+	{
+		return true;
+	}
+
+	LOG_ERROR << "<" << getId() << ">::validateInputPins input frameType is expected to be Audio. Actual<" << inputType << ">";
+	return false;
+}
+
+// Accepts the module only when it has exactly one output pin and that pin
+// carries FrameMetadata::TEXT; logs the reason and returns false otherwise.
+bool AudioToTextXForm::validateOutputPins()
+{
+	if (getNumberOfOutputPins() != 1)
+	{
+		LOG_ERROR << "<" << getId() << ">::validateOutputPins size is expected to be 1. Actual<" << getNumberOfOutputPins() << ">";
+		return false;
+	}
+
+	framemetadata_sp metadata = getFirstOutputMetadata();
+	FrameMetadata::FrameType frameType = metadata->getFrameType();
+	if (frameType != FrameMetadata::TEXT)
+	{
+		// The message previously said "input frameType" although the output pin is checked here.
+		LOG_ERROR << "<" << getId() << ">::validateOutputPins output frameType is expected to be TEXT. Actual<" << frameType << ">";
+		return false;
+	}
+
+	return true;
+}
+
+// Registers the input pin with the base class, then creates the TEXT output
+// pin (carrying the input's hint) and remembers its metadata and pin id.
+void AudioToTextXForm::addInputPin(framemetadata_sp& metadata, string& pinId)
+{
+	Module::addInputPin(metadata, pinId);
+
+	framemetadata_sp textMetadata(new FrameMetadata(FrameMetadata::FrameType::TEXT));
+	textMetadata->copyHint(*metadata);
+
+	mDetail->mOutputMetadata = textMetadata;
+	mDetail->mOutputPinId = addOutputPin(mDetail->mOutputMetadata);
+}
+
+// Initializes the whisper model.
+// Maps the configured sampling strategy onto the whisper enum, fills the
+// default whisper parameters and loads the model file named by the props.
+// Returns false (after logging) when the model cannot be loaded; otherwise
+// returns the result of Module::init().
+// Throws AIPException(AIP_FATAL) for an unknown sampling strategy.
+bool AudioToTextXForm::init()
+{
+	auto samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY;
+	switch (mDetail->mProps.samplingStrategy)
+	{
+		case AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY:
+			samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY;
+			break;
+		case AudioToTextXFormProps::DecoderSamplingStrategy::BEAM_SEARCH:
+			samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH;
+			break;
+		default:
+			throw AIPException(AIP_FATAL, "Unknown Sampling Strategy");
+	}
+	mDetail->mWhisperFullParams = whisper_full_default_params(samplingStrategy);
+	mDetail->mWhisperContextParams = whisper_context_default_params();
+	mDetail->mWhisperContext = whisper_init_from_file_with_params(mDetail->mProps.modelPath.c_str(), mDetail->mWhisperContextParams);
+	if (mDetail->mWhisperContext == nullptr)
+	{
+		// Without this check a bad model path would only surface later, when
+		// process() hands the null context to whisper_full().
+		LOG_ERROR << "<" << getId() << ">::init failed to load whisper model from <" << mDetail->mProps.modelPath << ">";
+		return false;
+	}
+	return Module::init();
+}
+
+// Releases the whisper context created in init() and terminates the base module.
+bool AudioToTextXForm::term()
+{
+	// mWhisperFullParams and mWhisperContextParams are by-value members obtained
+	// from whisper_full_default_params() / whisper_context_default_params().
+	// whisper_free_params() and whisper_free_context_params() delete their
+	// argument and are only meant for the heap objects returned by the
+	// *_by_ref() variants, so they must not be called on these members.
+	whisper_free(mDetail->mWhisperContext);
+	// Reset the handle so a repeated term() does not free the context twice.
+	mDetail->mWhisperContext = nullptr;
+	return Module::term();
+}
+
+// Buffers the incoming audio and, once enough samples are available, runs
+// whisper on the buffered audio and sends the transcription downstream.
+//
+// The first frame in `frames` is read as signed 16-bit samples, which are
+// scaled by 1/32768 to float and appended to the internal buffer. While the
+// buffer holds fewer than props.bufferSize samples nothing is emitted. A
+// non-positive bufferSize disables buffering, i.e. every frame is transcribed.
+// Always returns true; a failed whisper run is logged and its audio dropped.
+bool AudioToTextXForm::process(frame_container& frames)
+{
+	auto frame = frames.begin()->second;
+	const sf::Int16* samples = static_cast<const sf::Int16*>(frame->data());
+	const size_t numberOfSamples = frame->size() / sizeof(sf::Int16);
+	std::vector<float>& audioBuffer = mDetail->inputAudioBuffer;
+	for (size_t index = 0; index < numberOfSamples; ++index)
+	{
+		audioBuffer.push_back(static_cast<float>(samples[index]) / 32768.0f);
+	}
+
+	// Compare as unsigned only after ruling out negative values; a negative
+	// bufferSize would otherwise convert to a huge threshold and the buffer
+	// would grow without ever being transcribed.
+	const int bufferSize = mDetail->mProps.bufferSize;
+	if (bufferSize > 0 && audioBuffer.size() < static_cast<size_t>(bufferSize))
+	{
+		return true;
+	}
+
+	const int status = whisper_full(
+		mDetail->mWhisperContext,
+		mDetail->mWhisperFullParams,
+		audioBuffer.data(),
+		static_cast<int>(audioBuffer.size())
+	);
+	if (status != 0)
+	{
+		// Do not read segments after a failed run: they may belong to an
+		// earlier transcription and would be emitted a second time.
+		LOG_ERROR << "<" << getId() << ">::process whisper_full failed. status<" << status << ">";
+		audioBuffer.clear();
+		return true;
+	}
+
+	std::string output;
+	const int n_segments = whisper_full_n_segments(mDetail->mWhisperContext);
+	for (int i = 0; i < n_segments; ++i)
+	{
+		output += whisper_full_get_segment_text(mDetail->mWhisperContext, i);
+	}
+	audioBuffer.clear();
+
+	auto outFrame = makeFrame(output.length());
+	memcpy(outFrame->data(), output.c_str(), output.length());
+	frames.insert(make_pair(mDetail->mOutputPinId, outFrame));
+	send(frames);
+	return true;
+}
+
+// Hook for reacting to the input metadata. It currently only checks whether
+// the metadata is set and derives no state from it.
+void AudioToTextXForm::setMetadata(framemetadata_sp& metadata)
+{
+	const bool metadataReady = metadata->isSet();
+	if (!metadataReady)
+	{
+		return;
+	}
+}
+
+// Passes the frame's metadata to setMetadata() and always reports success.
+bool AudioToTextXForm::processSOS(frame_sp& frame)
+{
+	framemetadata_sp inputMetadata = frame->getMetadata();
+	setMetadata(inputMetadata);
+	return true;
+}
+
+// Passes the cached props through fillProps() and returns a copy of them.
+AudioToTextXFormProps AudioToTextXForm::getProps()
+{
+	AudioToTextXFormProps& cachedProps = mDetail->mProps;
+	fillProps(cachedProps);
+	return cachedProps;
+}
+
+// Applies a queued props change carried by `frame`.
+// A props object seeded with the current values is handed to
+// Module::handlePropsChange() and is then stored as the new cached props.
+// Returns whatever Module::handlePropsChange() returns.
+bool AudioToTextXForm::handlePropsChange(frame_sp& frame)
+{
+	// Seed every field from the current props. The buffer size used to be a
+	// hard-coded 32000 here, which replaced the configured value whenever the
+	// base-class handler left that field untouched.
+	AudioToTextXFormProps props(mDetail->mProps.samplingStrategy, mDetail->mProps.modelPath, mDetail->mProps.bufferSize);
+	auto ret = Module::handlePropsChange(frame, props);
+	mDetail->setProps(props);
+	return ret;
+}
+
+// Queues `props` on the base module; the change is applied later via handlePropsChange().
+void AudioToTextXForm::setProps(AudioToTextXFormProps& props)
+{
+	addPropsToQueue(props);
+}
diff --git a/base/test/audioToTextXform_tests.cpp b/base/test/audioToTextXform_tests.cpp
@@ -0,0 +1,59 @@
+#include "stdafx.h"
+#include <boost/test/unit_test.hpp>
+#include "FrameMetadata.h"
+#include "FrameMetadataFactory.h"
+#include "Frame.h"
+#include "Logger.h"
+#include "test_utils.h"
+#include "PipeLine.h"
+#include "FileWriterModule.h"
+#include "FileReaderModule.h"
+#include "FileWriterModule.h"
+#include "AudioToTextXForm.h"
+#include "Module.h"
+#include<iostream>
+#include<fstream>
+#include<vector>
+
+BOOST_AUTO_TEST_SUITE(audioToTextXform_test)
+
+// End-to-end check of AudioToTextXForm: a raw PCM file is read, passed through
+// the transform with a tiny English whisper model, written to a text file by
+// FileWriterModule, and the file content is compared with a fixed transcript.
+BOOST_AUTO_TEST_CASE(test_asr, *boost::unit_test::enabled())
+{
+    // Output file of the pipeline; presumably cleaned up by FileCleaner (see test_utils.h).
+    std::vector<std::string> asrOutText = { "./data/asr_out.txt" };
+    Test_Utils::FileCleaner f(asrOutText);
+
+	Logger::setLogLevel(boost::log::trivial::severity_level::info);
+
+    // This is a PCM file without WAV header
+    auto fileReaderProps = FileReaderModuleProps("./data/audioToTextXform_test.pcm");
+    fileReaderProps.readLoop = false;
+    auto fileReader = boost::shared_ptr<FileReaderModule>(new FileReaderModule(fileReaderProps));
+    auto metadata = framemetadata_sp(new FrameMetadata(FrameMetadata::AUDIO));
+    auto pinId = fileReader->addOutputPin(metadata);
+
+    // Greedy decoding, quantized tiny English model, transcription starts once
+    // at least 18000 samples have been buffered.
+    auto asr = boost::shared_ptr<Module>(new AudioToTextXForm(AudioToTextXFormProps(
+        AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY
+        ,"./data/whisper/models/ggml-tiny.en-q8_0.bin",18000)));
+    fileReader->setNext(asr);
+
+    auto outputFile = boost::shared_ptr<Module>(new FileWriterModule(FileWriterModuleProps(asrOutText[0], false)));
+    asr->setNext(outputFile);
+
+    PipeLine p("test");
+    p.appendModule(fileReader);
+    p.init();
+    p.run_all_threaded();
+    // Fixed 10 s wait for the threaded pipeline to read, transcribe and write.
+    // The output file is read below while the pipeline is still running.
+    boost::this_thread::sleep_for(boost::chrono::milliseconds(10000));  // giving the modules time to step
+    std::ifstream in_file_text(asrOutText[0]);
+    std::stringstream buffer;
+    buffer << in_file_text.rdbuf();
+    // NOTE(review): the two adjacent literals are concatenated without a space,
+    // so the expected text contains "learningor" - confirm this matches the
+    // model output and is not a missing space.
+    BOOST_TEST(
+        (buffer.str() == " The Matic speech recognition also known as ASR is the use of machine learning"
+            "or artificial intelligence technology to process human speech into readable text."));
+    p.stop();
+    p.term();
+    p.wait_for_all();
+    in_file_text.close();
+}
+
+BOOST_AUTO_TEST_SUITE_END()