Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate Whisper CPP and write a wrapper module in Aprapipes #324

Merged
merged 43 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
8e23a6e
Add custom port vcpkg for whisper
joiskash Dec 22, 2023
fb29351
Add whisper stream
joiskash Dec 22, 2023
4ed21e2
Add whisper stream header
joiskash Dec 22, 2023
deab2d0
Add whisper cpp to Cmake list
joiskash Dec 22, 2023
9462f59
Add test frame type and minor changes
joiskash Dec 22, 2023
c55c40b
Add whisper to vcpkg
joiskash Dec 27, 2023
870862c
Add vcpkg custom overlay ports to thirdparty
joiskash Dec 27, 2023
ca4a6e4
Modify with whisper option
joiskash Dec 27, 2023
482f02c
Send whisper output as text frames
joiskash Dec 27, 2023
d12edf5
revert changes to sound record test
joiskash Dec 27, 2023
3275afd
Add whisper UT
joiskash Dec 27, 2023
acd8a1f
Fix PS to remove whisper from vcpkg json
joiskash Dec 27, 2023
9b18eb3
Revert changes to OPTIONS section, remove WHISPER option, rename Whis…
joiskash Dec 31, 2023
cf5d8a4
Move pcm to git lfs
joiskash Dec 31, 2023
5ad9157
Add pcm and model bin file to lfs
joiskash Dec 31, 2023
ded9a03
Fix UT name
joiskash Dec 31, 2023
ec0ca73
Throw AIP exception for unknown strategy
joiskash Dec 31, 2023
6d4528e
Revert sound_record_tests.cpp changes
joiskash Dec 31, 2023
91fe148
Revert changes to vcpkg indentation and remove Whisper option
joiskash Dec 31, 2023
2021355
Linux -> OFF to ON Windows ON -> OFF
joiskash Jan 3, 2024
80500ce
Add reserve statement for vector
joiskash Jan 9, 2024
42ca754
update submodule for pipeline to run
joiskash Jan 9, 2024
66cd4d8
Update whisper port with install fix
joiskash Jan 13, 2024
e817f98
update submodule
joiskash Jan 13, 2024
ce3d6e2
Update vcpkg version
joiskash Jan 13, 2024
f33644f
Add changes to handle props change
joiskash Jan 13, 2024
b6e20df
Improve UT and refactor for changing sample strategy during run time.
joiskash Jan 13, 2024
925e508
Add apt-get install libx11-dev libgles2-mesa-dev for libepoxy error
joiskash Jan 13, 2024
1d7bc11
Add memory type check in validate input pins and throw exception if m…
joiskash Feb 15, 2024
bc04e47
update submodule
joiskash Feb 15, 2024
25090aa
Merge branch 'main' into kj/whisper-asr
joiskash Feb 15, 2024
0c56895
update vcpkg mysys2
joiskash Feb 15, 2024
969e844
update submodule
joiskash Feb 15, 2024
9f58b90
Address nits
joiskash Feb 16, 2024
1e738f6
Export env variable overlay port for building in arm64
joiskash Feb 16, 2024
d478555
added fix-for-arm64.patch for whisper
kushaljain-apra Feb 23, 2024
67cbe9a
update fix-vcpkg-json.ps1
kushaljain-apra Feb 23, 2024
6ddd487
update CMakeLists.txt
Feb 23, 2024
dba812f
update vcpkg url for build
joiskash Feb 23, 2024
4716f25
update whisper tests threshold
kushaljain-apra Feb 26, 2024
f494d88
update code formatting
kushaljain-apra Feb 26, 2024
ad0977b
update whisper test
kushaljain-apra Feb 26, 2024
42df5de
added EOS for small buffer size
kushaljain-apra Feb 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions base/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
cmake_minimum_required(VERSION 3.22)

OPTION(ENABLE_LINUX "Use this switch to enable LINUX" ON)
OPTION(ENABLE_LINUX "Use this switch to enable LINUX" OFF)
joiskash marked this conversation as resolved.
Show resolved Hide resolved
OPTION(ENABLE_CUDA "Use this switch to enable CUDA" ON)
OPTION(ENABLE_ARM64 "Use this switch to enable ARM64" OFF)
OPTION(ENABLE_WINDOWS "Use this switch to enable WINDOWS" OFF)
OPTION(ENABLE_WINDOWS "Use this switch to enable WINDOWS" ON)

set(VCPKG_INSTALL_OPTIONS "--clean-after-build")
set(VCPKG_OVERLAY_PORTS "${CMAKE_CURRENT_SOURCE_DIR}/../thirdparty/custom-overlay")
kumaakh marked this conversation as resolved.
Show resolved Hide resolved

IF(ENABLE_CUDA)
add_compile_definitions(APRA_CUDA_ENABLED)
ENDIF(ENABLE_CUDA)
Expand Down Expand Up @@ -38,8 +40,6 @@ project(APRAPIPES)
message(STATUS $ENV{PKG_CONFIG_PATH}">>>>>> PKG_CONFIG_PATH")

find_package(PkgConfig REQUIRED)


find_package(Boost COMPONENTS system thread filesystem serialization log chrono unit_test_framework REQUIRED)
find_package(JPEG REQUIRED)
find_package(OpenCV CONFIG REQUIRED)
Expand All @@ -50,6 +50,7 @@ find_package(FFMPEG REQUIRED)
find_package(ZXing CONFIG REQUIRED)
find_package(bigint CONFIG REQUIRED)
find_package(SFML COMPONENTS system window audio graphics CONFIG REQUIRED)
find_package(whisper CONFIG REQUIRED)

IF(ENABLE_CUDA)
if((NOT DEFINED CMAKE_CUDA_ARCHITECTURES) OR (CMAKE_CUDA_ARCHITECTURES STREQUAL ""))
Expand Down Expand Up @@ -280,10 +281,9 @@ SET(IP_FILES
src/OverlayFactory.h
src/OverlayFactory.cpp
src/TestSignalGeneratorSrc.cpp
src/AudioToTextXForm.cpp
)




SET(IP_FILES_H
include/HistogramOverlay.h
include/CalcHistogramCV.h
Expand All @@ -306,10 +306,9 @@ SET(IP_FILES_H
include/TextOverlayXForm.h
include/ColorConversionXForm.h
include/Overlay.h
include/AudioToTextXForm.h
)



SET(CUDA_CORE_FILES
src/apra_cudamalloc_allocator.cu
src/apra_cudamallochost_allocator.cu
Expand Down Expand Up @@ -561,6 +560,7 @@ SET(UT_FILES
test/mp4_dts_strategy_tests.cpp
test/overlaymodule_tests.cpp
test/testSignalGeneratorSrc_tests.cpp
test/audioToTextXform_tests.cpp
${ARM64_UT_FILES}
${CUDA_UT_FILES}
)
Expand Down Expand Up @@ -607,6 +607,7 @@ target_link_libraries(aprapipesut
liblzma::liblzma
bigint::bigint
sfml-audio
whisper::whisper
)

IF(ENABLE_WINDOWS)
Expand Down
50 changes: 50 additions & 0 deletions base/include/AudioToTextXForm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#pragma once

#include "Module.h"

// size of audio to process should be a parameter.
// Cache variable to collect frames for processing

// Configuration for the AudioToTextXForm module.
class AudioToTextXFormProps : public ModuleProps
{
public:
    // Decoder strategies; each value is annotated with the whisper strategy it stands for.
    enum DecoderSamplingStrategy
    {
        GREEDY,     //WHISPER_SAMPLING_GREEDY
        BEAM_SEARCH //WHISPER_SAMPLING_BEAM_SEARCH
    };

    // Stores the three settings verbatim; no validation is performed here.
    AudioToTextXFormProps(DecoderSamplingStrategy _samplingStrategy,
                          std::string _modelPath,
                          int _bufferSize)
        : samplingStrategy(_samplingStrategy),
          modelPath(_modelPath),
          bufferSize(_bufferSize)
    {
    }

    DecoderSamplingStrategy samplingStrategy; // selects the whisper sampling strategy
    std::string modelPath;                    // path of the model file to load
    int bufferSize;                           // sample count compared against the buffered audio size
};

// Transform module that accumulates incoming audio samples and emits
// whisper transcriptions as TEXT frames on a single output pin.
class AudioToTextXForm : public Module
{

public:
AudioToTextXForm(AudioToTextXFormProps _props);
virtual ~AudioToTextXForm();
// Builds the whisper parameters and loads the model named in the props.
bool init();
// Releases the whisper context created by init().
bool term();
// Queues a props change; it is applied later through handlePropsChange().
void setProps(AudioToTextXFormProps& props);
// Returns a copy of the props currently held by the module.
AudioToTextXFormProps getProps();

protected:
// Converts the incoming 16-bit samples to float, buffers them, and once at
// least bufferSize samples are held runs whisper and sends the transcript.
bool process(frame_container& frames);
bool processSOS(frame_sp& frame);
// Accepts exactly one input pin whose frame type is AUDIO.
bool validateInputPins();
// Accepts exactly one output pin whose frame type is TEXT.
bool validateOutputPins();
// Registers the input pin and creates the matching TEXT output pin.
void addInputPin(framemetadata_sp& metadata, string& pinId);
bool handlePropsChange(frame_sp& frame);

private:
void setMetadata(framemetadata_sp& metadata);
// Private implementation state, defined in the .cpp file.
class Detail;
boost::shared_ptr<Detail> mDetail;
};
3 changes: 2 additions & 1 deletion base/include/FrameMetadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ class FrameMetadata {
HEVC_DATA, //H265
MOTION_VECTOR_DATA,
OVERLAY_INFO_IMAGE,
FACE_LANDMARKS_INFO
FACE_LANDMARKS_INFO,
TEXT
};

enum MemType
Expand Down
3 changes: 3 additions & 0 deletions base/include/Mp4WriterSinkUtils.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#include <ctime>
#include <chrono>
#include <string>
#include <boost/filesystem.hpp>
kumaakh marked this conversation as resolved.
Show resolved Hide resolved

class Mp4WriterSinkUtils
{
Expand Down
180 changes: 180 additions & 0 deletions base/src/AudioToTextXForm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
#include "AudioToTextXForm.h"
#include "FrameMetadata.h"
#include "FrameMetadataFactory.h"
#include "Frame.h"
#include "Logger.h"
#include "Utils.h"
#include "whisper.h"
#include "SFML/Config.hpp"

// Private implementation state of AudioToTextXForm: cached props, output pin
// bookkeeping, the pending audio samples and the whisper handles.
class AudioToTextXForm::Detail
{
public:
    Detail(AudioToTextXFormProps& _props) : mProps(_props)
    {
    }
    // Does not free the whisper context; AudioToTextXForm::term() does that.
    ~Detail() {}

    // Replaces the cached props with the supplied ones.
    void setProps(AudioToTextXFormProps& props)
    {
        mProps = props;
    }

public:
    framemetadata_sp mOutputMetadata;    // metadata of the TEXT output pin
    std::string mOutputPinId;            // id returned by addOutputPin
    std::vector<float> inputAudioBuffer; // float samples waiting to be transcribed
    AudioToTextXFormProps mProps;
    // Every member below gets an initializer so that nothing is read in an
    // indeterminate state before init() has run.
    int mFrameType = 0;
    whisper_context *mWhisperContext = nullptr;
    whisper_full_params mWhisperFullParams{};
    whisper_context_params mWhisperContextParams{};
};

// Registers the module as a TRANSFORM named "AudioToTextXForm" and keeps a
// copy of the props inside the private Detail object.
AudioToTextXForm::AudioToTextXForm(AudioToTextXFormProps _props)
    : Module(TRANSFORM, "AudioToTextXForm", _props)
{
    Detail* detail = new Detail(_props);
    mDetail.reset(detail);
}

// No explicit cleanup here; the whisper context is released in term().
AudioToTextXForm::~AudioToTextXForm() = default;

bool AudioToTextXForm::validateInputPins()
yashrajsapra marked this conversation as resolved.
Show resolved Hide resolved
{
if (getNumberOfInputPins() != 1)
{
LOG_ERROR << "<" << getId() << ">::validateInputPins size is expected to be 1. Actual<" << getNumberOfInputPins() << ">";
return false;
}

framemetadata_sp metadata = getFirstInputMetadata();

FrameMetadata::FrameType frameType = metadata->getFrameType();
if (frameType != FrameMetadata::AUDIO)
{
LOG_ERROR << "<" << getId() << ">::validateInputPins input frameType is expected to be Audio. Actual<" << frameType << ">";
return false;
}

return true;
}

// Accepts the pin layout only when there is exactly one output pin and that
// pin carries TEXT frames; logs the reason and returns false otherwise.
bool AudioToTextXForm::validateOutputPins()
{
    if (getNumberOfOutputPins() != 1)
    {
        LOG_ERROR << "<" << getId() << ">::validateOutputPins size is expected to be 1. Actual<" << getNumberOfOutputPins() << ">";
        return false;
    }

    framemetadata_sp metadata = getFirstOutputMetadata();
    FrameMetadata::FrameType frameType = metadata->getFrameType();
    if (frameType != FrameMetadata::TEXT)
    {
        // The message previously said "input frameType", which pointed at the wrong pin.
        LOG_ERROR << "<" << getId() << ">::validateOutputPins output frameType is expected to be TEXT. Actual<" << frameType << ">";
        return false;
    }

    return true;
}

// Registers the input pin with the base class, then creates the TEXT output
// pin (carrying over the input metadata's hint) and remembers its id.
void AudioToTextXForm::addInputPin(framemetadata_sp& metadata, string& pinId)
{
    Module::addInputPin(metadata, pinId);

    framemetadata_sp textMetadata(new FrameMetadata(FrameMetadata::FrameType::TEXT));
    textMetadata->copyHint(*metadata);
    mDetail->mOutputMetadata = textMetadata;

    mDetail->mOutputPinId = addOutputPin(mDetail->mOutputMetadata);
}

// Builds the whisper decoding parameters from the configured sampling
// strategy, loads the model file named in the props and then runs the base
// class initialization.
// Throws AIPException(AIP_FATAL) for an unknown strategy or when the model
// cannot be loaded.
bool AudioToTextXForm::init()
{
    // Translate the module-level strategy into whisper's own enum.
    auto samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY;
    switch (mDetail->mProps.samplingStrategy)
    {
    case AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY:
        samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY;
        break;
    case AudioToTextXFormProps::DecoderSamplingStrategy::BEAM_SEARCH:
        samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH;
        break;
    default:
        throw AIPException(AIP_FATAL, "Unknown Sampling Strategy");
    }

    mDetail->mWhisperFullParams = whisper_full_default_params(samplingStrategy);
    mDetail->mWhisperContextParams = whisper_context_default_params();
    mDetail->mWhisperContext = whisper_init_from_file_with_params(
        mDetail->mProps.modelPath.c_str(), mDetail->mWhisperContextParams);

    // The loader reports failure by returning a null context. Fail here so
    // that process() never hands a null context to whisper.
    if (mDetail->mWhisperContext == nullptr)
    {
        throw AIPException(AIP_FATAL, "Failed to load whisper model <" + mDetail->mProps.modelPath + ">");
    }

    return Module::init();
}

// Releases the whisper context created by init() and then terminates the
// base module.
bool AudioToTextXForm::term()
{
    // The parameter structs are plain by-value members filled from
    // whisper_full_default_params() / whisper_context_default_params().
    // They must NOT be passed to whisper_free_params() or
    // whisper_free_context_params(): those helpers release heap objects
    // returned by the *_by_ref() variants, so calling them on a member
    // address is undefined behaviour.
    whisper_free(mDetail->mWhisperContext);
    // Clear the pointer so a repeated term() cannot free it twice.
    mDetail->mWhisperContext = nullptr;
    return Module::term();
}

// Appends the incoming 16-bit samples (converted to float) to the pending
// audio buffer. Once at least bufferSize samples are buffered, runs whisper
// over the whole buffer, concatenates the text of every segment, clears the
// buffer and sends the transcript as one frame on the output pin.
// Always returns true; a failed whisper run is logged and its audio dropped.
bool AudioToTextXForm::process(frame_container& frames)
{
    auto frame = frames.begin()->second;

    // The frame payload is read as 16-bit integer samples; dividing by 32768
    // scales each one into the float range before it is buffered.
    const sf::Int16* samples = static_cast<sf::Int16*>(frame->data());
    const int numberOfSamples = static_cast<int>(frame->size() / sizeof(sf::Int16));

    std::vector<float>& audioBuffer = mDetail->inputAudioBuffer;
    for (int index = 0; index < numberOfSamples; index++)
    {
        audioBuffer.push_back(static_cast<float>(samples[index]) / 32768.0f);
    }

    // Keep accumulating until enough audio is available.
    if (audioBuffer.size() < mDetail->mProps.bufferSize)
    {
        return true;
    }

    const int status = whisper_full(
        mDetail->mWhisperContext,
        mDetail->mWhisperFullParams,
        audioBuffer.data(),
        static_cast<int>(audioBuffer.size()));
    if (status != 0)
    {
        // Do not emit a transcript for a run that failed. Drop the buffered
        // audio so the buffer cannot grow without bound.
        LOG_ERROR << "<" << getId() << ">::process whisper_full failed. Status<" << status << ">";
        audioBuffer.clear();
        return true;
    }

    std::string output = "";
    const int n_segments = whisper_full_n_segments(mDetail->mWhisperContext);
    for (int i = 0; i < n_segments; ++i)
    {
        const char* text = whisper_full_get_segment_text(mDetail->mWhisperContext, i);
        output += text;
    }
    audioBuffer.clear();

    // The output frame holds the raw characters without a trailing NUL.
    auto outFrame = makeFrame(output.length());
    memcpy(outFrame->data(), output.c_str(), output.length());
    frames.insert(make_pair(mDetail->mOutputPinId, outFrame));
    send(frames);
    return true;
}

// Placeholder hook: it queries whether the metadata is set but derives no
// state from it.
void AudioToTextXForm::setMetadata(framemetadata_sp& metadata)
{
    if (metadata->isSet())
    {
        // Intentionally empty: nothing is configured from the input metadata.
    }
}

// Passes the first frame's metadata to setMetadata() and reports success.
bool AudioToTextXForm::processSOS(frame_sp& frame)
{
    framemetadata_sp inputMetadata = frame->getMetadata();
    setMetadata(inputMetadata);
    return true;
}

// Lets the base class fill in the props it manages, then returns a copy of
// the cached props.
AudioToTextXFormProps AudioToTextXForm::getProps()
{
    AudioToTextXFormProps& cachedProps = mDetail->mProps;
    fillProps(cachedProps);
    return cachedProps;
}

// Applies a queued props change: seeds a props object from the current
// settings, lets the base class update it from the frame, then stores it.
bool AudioToTextXForm::handlePropsChange(frame_sp& frame)
{
    // Seed every field from the current props. bufferSize used to be seeded
    // with a hard-coded 32000, which replaced the configured value whenever
    // the base-class handler did not overwrite it.
    AudioToTextXFormProps props(
        mDetail->mProps.samplingStrategy,
        mDetail->mProps.modelPath,
        mDetail->mProps.bufferSize);
    auto ret = Module::handlePropsChange(frame, props);
    mDetail->setProps(props);
    // NOTE(review): the whisper decoding parameters are only built in init(),
    // so a changed samplingStrategy does not appear to take effect until
    // init() runs again. Confirm whether that is intended.
    return ret;
}

// Queues the supplied props with the base class instead of applying them
// directly; handlePropsChange() is the method that stores new props.
void AudioToTextXForm::setProps(AudioToTextXFormProps& props)
{
Module::addPropsToQueue(props);
}
59 changes: 59 additions & 0 deletions base/test/audioToTextXform_tests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#include "stdafx.h"
#include <boost/test/unit_test.hpp>
mraduldubey marked this conversation as resolved.
Show resolved Hide resolved
#include "FrameMetadata.h"
#include "FrameMetadataFactory.h"
#include "Frame.h"
#include "Logger.h"
#include "test_utils.h"
#include "PipeLine.h"
#include "FileWriterModule.h"
#include "FileReaderModule.h"
#include "FileWriterModule.h"
#include "AudioToTextXForm.h"
#include "Module.h"
#include<iostream>
mraduldubey marked this conversation as resolved.
Show resolved Hide resolved
#include<fstream>
#include<vector>

BOOST_AUTO_TEST_SUITE(audioToTextXform_test)
mraduldubey marked this conversation as resolved.
Show resolved Hide resolved

// End-to-end check: reads a raw PCM file, runs it through AudioToTextXForm
// with the greedy strategy and compares the text written by the file writer
// against the expected transcript.
BOOST_AUTO_TEST_CASE(test_asr, *boost::unit_test::enabled())
{
    std::vector<std::string> asrOutText = { "./data/asr_out.txt" };
    Test_Utils::FileCleaner f(asrOutText);

    Logger::setLogLevel(boost::log::trivial::severity_level::info);

    // This is a PCM file without WAV header
    auto fileReaderProps = FileReaderModuleProps("./data/audioToTextXform_test.pcm");
    fileReaderProps.readLoop = false;
    auto fileReader = boost::shared_ptr<FileReaderModule>(new FileReaderModule(fileReaderProps));
    auto metadata = framemetadata_sp(new FrameMetadata(FrameMetadata::AUDIO));
    auto pinId = fileReader->addOutputPin(metadata);

    auto asr = boost::shared_ptr<Module>(new AudioToTextXForm(AudioToTextXFormProps(
        AudioToTextXFormProps::DecoderSamplingStrategy::GREEDY,
        "./data/whisper/models/ggml-tiny.en-q8_0.bin", 18000)));
    fileReader->setNext(asr);

    auto outputFile = boost::shared_ptr<Module>(new FileWriterModule(FileWriterModuleProps(asrOutText[0], false)));
    asr->setNext(outputFile);

    PipeLine p("test");
    p.appendModule(fileReader);
    p.init();
    p.run_all_threaded();
    boost::this_thread::sleep_for(boost::chrono::milliseconds(10000)); // giving time to call step

    std::ifstream in_file_text(asrOutText[0]);
    std::stringstream buffer;
    buffer << in_file_text.rdbuf();
    // Adjacent string literals are concatenated as-is, so the first literal
    // must end with a space. Without it the expected text read
    // "machine learningor artificial".
    BOOST_TEST(
        (buffer.str() == " The Matic speech recognition also known as ASR is the use of machine learning "
                         "or artificial intelligence technology to process human speech into readable text."));

    p.stop();
    p.term();
    p.wait_for_all();
    in_file_text.close();
}

BOOST_AUTO_TEST_SUITE_END()
Loading
Loading