From 421b839920fd7d02a819b8ed962aa81f52c5d685 Mon Sep 17 00:00:00 2001 From: kuro337 Date: Fri, 19 Jan 2024 20:07:57 -0500 Subject: [PATCH] Initial Commit --- .gitignore | 5 + CMakeLists.txt | 117 +++++++ assets/logo.png | Bin 0 -> 3426 bytes benchmarks/cache_benchmark.cc | 155 ++++++++++ readme.md | 179 +++++++++++ tests/ocr_test.cc | 105 +++++++ tests/similarity_test.cc | 40 +++ textract.h | 564 ++++++++++++++++++++++++++++++++++ 8 files changed, 1165 insertions(+) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 assets/logo.png create mode 100644 benchmarks/cache_benchmark.cc create mode 100644 readme.md create mode 100644 tests/ocr_test.cc create mode 100644 tests/similarity_test.cc create mode 100644 textract.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f963338 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.cache +main.cc +build +read.md +images diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..3275206 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,117 @@ +cmake_minimum_required(VERSION 3.20) +project(opencvOCR) + +# change standard as required +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + + +find_package(OpenCV REQUIRED) + +find_package(OpenSSL REQUIRED) + +find_package(OpenMP) + +find_package(Folly CONFIG REQUIRED) + +find_package(gflags CONFIG REQUIRED) + +find_package(CURL REQUIRED) + +include_directories("/opt/homebrew/opt/tesseract/include") +link_directories("/opt/homebrew/opt/tesseract/lib") + +# gtest +include(FetchContent) +FetchContent_Declare( + googletest + URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip + DOWNLOAD_EXTRACT_TIMESTAMP TRUE +) +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) # windows setting +FetchContent_MakeAvailable(googletest) + + +# main + +add_executable( + main + 
main.cc +) + +target_link_libraries(main + PUBLIC GTest::gtest_main + PUBLIC Folly::folly + PUBLIC OpenSSL::Crypto + PUBLIC tesseract + PUBLIC ${OpenCV_LIBS} + PUBLIC ${CURL_LIBRARIES} +) + +if(OpenMP_CXX_FOUND) + message(STATUS "Using OpenMP") + target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX) +else() + message(STATUS "OpenMP not found") + +endif() + +# tests + + +enable_testing() + +add_executable(ocr_test tests/ocr_test.cc) +add_executable(similarity_test tests/similarity_test.cc) +add_executable(cache_benchmark benchmarks/cache_benchmark.cc) + +target_link_libraries( + cache_benchmark + PUBLIC GTest::gtest_main + PUBLIC ${CURL_LIBRARIES} + PUBLIC ${OpenCV_LIBS} + PUBLIC OpenSSL::Crypto + PUBLIC tesseract + PUBLIC Folly::folly + PUBLIC Folly::follybenchmark +) + + +target_link_libraries( + ocr_test + PUBLIC GTest::gtest_main + PUBLIC ${CURL_LIBRARIES} + PUBLIC ${OpenCV_LIBS} + PUBLIC OpenSSL::Crypto + PUBLIC tesseract + PUBLIC Folly::folly +) + +target_link_libraries( + similarity_test + PUBLIC GTest::gtest_main + PUBLIC ${CURL_LIBRARIES} + PUBLIC ${OpenCV_LIBS} + PUBLIC OpenSSL::Crypto + PUBLIC tesseract + PUBLIC Folly::folly +) + + + +include(GoogleTest) +gtest_discover_tests(ocr_test) +gtest_discover_tests(similarity_test) + + +if(OpenMP_CXX_FOUND) + message(STATUS "Using OpenMP") + target_link_libraries(ocr_test PUBLIC OpenMP::OpenMP_CXX) + target_link_libraries(similarity_test PUBLIC OpenMP::OpenMP_CXX) +else() + message(STATUS "OpenMP not found") + +endif() + diff --git a/assets/logo.png b/assets/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..29ff5285166d94d516e58e2be21c62319657425c GIT binary patch literal 3426 zcmZWs2{;s5-#@c3p+Qvk8Ol=jk|pazLL|{;Y(XhIng z$ug2$j50N{XK5^B4Boll`+VR1-se5fIp_a8=lsuq`JLZ+p8rX+v$a6-OY#E%kXDwb z?YR>F_t?YBo!gB)d$@ux#L_t&00Ggz2Lu#66z3}Mw>o|5V)VnGkFJMY{4CRzcMH?O z5A9YQwQ!N)tNi0g)>(97gns}n-^IZHw=oyj?kS{v4o1^3R~BMQk(VEp&q46=T9>EG zmcnVa^JVFWmH)Jj2QnqHj6hqoEqcY(n3i%xvb$yhwzEaV^DSS9ql$ylb^GA;VSL 
zc{*zowHm?E6`Mc3>ZZC67$pA=akr4)DCm$mSG zLgbSihfuDOg7I|n2}sQjK4lrc5IQCzFu1PemuAExC37htB|CV{(z=&fDr6rE_7r&S zT|m{4C;JjN&7k=eeYcqD=6z3@#wmI&|4Yqk{(Hv9+mNp7mSL#+7_nVj93+y z!*=?xqW>}}2oz5!xZ06iUe@W&SiUzf`G*i1k8Ed}qcdKdH1is7_k`hby9rHlg2XZ^ z@z~XK^6HI@ z{EQlgv^evx{%J#?wWcn`h#o2qW;S3pdT09APYF;`X<(;bj6P%ig(}n#i5u4yooS>k zhsC|BQ*Sf*epM7?#L1VQ+AhN{vW`)%H-KFOIM~mtCIV-z)?zNJ@s53OP-hHR0faKd zHc^iiXvBXFNGCu_`Y_~2cX$XoJ91iJa?r(%;;;LCbjyX0GJ3ev8p(I_25!Qa5Oj~_ z!(l`%_kQhBLf?x$NdMwOsCIIm(2$%v<15LFA*E@^Yp-~e&wkAZmy7653(2O!Glf2w zxif{-k(@oN9-`t+k`fo{L~C6XF#Zl&Tm4&aF<4->JV>S{O*QMz}eSWrLVk$ z`fbsjYP$K&LxsXdV}is@*I|gAXzPF`AjVFLdK;-zXw`n$1&e)0*Ht1>i05m=ZMe9p zE$~5XUw`yoOWOXO_<58cC(kUTtbvnivX9=YbVsex!o6boX^~DyXZ6c(%ja)*kE|Tc zsZeAh2(gor3~=$C)1A2zqomPGKrw)jfH$H}~U-L=d zi%$q*BIC5F!mF6-kt@`+gcbp6dXJfR$IFt`jX$y~Zlfw00$h#qd=*Np{kxfO^m7hP z6C~Q{h+yB^j8&iF8SJ#VcHB{N+WCzp1}Ot(Cvn+KIn+z=@z0Z`D%(xV)k-T$#Z>Hc z$6pg(r4I4uo@!m%W~h=fVat=1F7|0l=TAOpoK9b?&zm7xQ>VAWOJ?+5cjz-doAigh z=Hi+8q#c=*I(umgLYN43%KXS|n@&AlF{>~vQM(g*U)#3^?8qTOdLa3+6gam}raVs% zW`~o;H{@+~>$kX0d9cHZW{l0!PuLDHM@7IwhSP4~eG$CyU9cBUafxTt1-SxA#hJ}O z0Z_4{0OBp7185K+j86{+LZ9*i`7!YYMb!Ji1J}aF)t^96BpX9njcQ=$gi&!2gg(>? 
zCkR;qHjE$*IntGR^1qae~R*4Pz_f3z}J+&8ER+J32UTs99-+P$f&ALCPU9d$2I zaTWmkZ;OHyWw?j7I;CyWybx+P2bzFq%ozDVOOQ@zWAca5#xgNDe*Wwy^@D96E z;D{M;%;H5UHV5%j*jE2Ivxf?sir|yLk#-)vkztTKDh_P7tpPg+PA~zE87l!Qoj~ZG znD6aHE#iy0|2nt>uvx$`NgzaX)W-olQJmcjjxQ9^{%W zgwxM*JKghnKo%F@v!oVBD(*a(3Yjg+?;F$uJ;Hc$&*sBx{k;Hd!c-kXu9VX zR8sk|d7Qowrt1~J2$!LwstQ0Zq8r%V_28J5-VhL-6Nb_~Y0$b~y#FSUNz-ue0fM6* z;DTpF7I;rWV%BTm&%GeO)54Z{{fu++?!#gN1O^z0bBTL{uppuc%BFsaRsT7=cfbV& zs#nwpT+j>s^y4#)~Wv|~v34g?+Mg(1$+tFS>B;($^Lk-L=ndaz%h&UaxyC;{c zLI;(^cZqh57l?pRjP9&&#-hE+TZ40nDY7j(m3K1%X5u>PMLMm0sZ5^d#kp1qVaFPv zYNNU1+}6g!_qpxLne(hg0T4Uq)e`5+@KH|gNqFIRV&O*QL8(KR zXOh(mQ=;eDToDIk>ORtbd4zfD%iecRjBSj|2J>#(cP7@`=jwBQYw2St5D$Ciy{t+e zZVhr`%ZpoO*z?<8x*%YW(T~m43e**%Jj~VhH&&OIY3P)wIW9z5$VL2Jfsldx_TUMi z*^_?@ej!p&T~Q{|xBliUE89;%v#Q={qY8M+zK~ zPE08cBaj|20o9JlG`zWN!4zSKOV~ zS7ghUZ;2<=%-^mVCz4bp60(P+m(-G^sm=;ru)+0E*{sp_I`Q@0e5iH~Lo_qo!UNnWj!0-d4MkVqlc>%_yl2Gp1jk z_1Px){_JJub9;Jo?A{bNiTBG754sQ8)hC4cMCYCQs$FJIjln$25!W-JGZ%%tiS`PI z4-hk@3>NI;s+vQWh1!E#B{X7d9!Kaf{VTpRzhv+4Erz z6Z<6bh +#include +#include +#include + +/* + +1 second: 1 +1 millisecond (ms): 0.001 seconds +1 microsecond (us): 0.000001 seconds +1 nanosecond (ns): 0.000000001 seconds + + +*/ + +void mutexMapBenchmark(int threadCount, int insertionsPerThread) { + std::unordered_map map; + std::mutex mapMutex; + + std::vector threads; + for (int i = 0; i < threadCount; ++i) { + threads.emplace_back([&map, &mapMutex, i, insertionsPerThread] { + for (int j = 1; j <= insertionsPerThread; ++j) { + int key = i * insertionsPerThread + j; + std::string value = "Value " + std::to_string(key); + + std::lock_guard lock(mapMutex); + map[key] = std::move(value); + } + }); + } + + for (auto &t : threads) { + t.join(); + } +} + +void concurrentMapBenchmark(int threadCount, int insertionsPerThread) { + folly::ConcurrentHashMap map; + + std::vector threads; + for (int i = 0; i < threadCount; ++i) { + threads.emplace_back([&map, i, insertionsPerThread] { + for (int j = 0; j < 
insertionsPerThread; ++j) { + map.insert_or_assign(j, "Value " + std::to_string(i * 100 + j)); + } + }); + } + + for (auto &t : threads) { + t.join(); + } +} + +void concurrentMapBenchmarkComplex(int threadCount, int insertionsPerThread) { + auto v = std::vector{"Complex", "Data", "Type"}; + + folly::ConcurrentHashMap> map; + + std::vector threads; + for (int i = 0; i < threadCount; ++i) { + threads.emplace_back([&map, &v, i, insertionsPerThread] { + for (int j = 1; j <= insertionsPerThread; ++j) { + std::string key = "Key" + std::to_string(i * insertionsPerThread + j); + map.insert_or_assign(std::move(key), v); + } + }); + } + + for (auto &t : threads) { + t.join(); + } +} + +void atomicMapBenchmark(int threadCount, int insertionsPerThread) { + folly::AtomicUnorderedInsertMap atomicMap( + threadCount * insertionsPerThread); + + std::vector threads; + for (int i = 0; i < threadCount; ++i) { + threads.emplace_back([&atomicMap, i, insertionsPerThread] { + for (int j = 1; j <= insertionsPerThread; ++j) { + int key = i * insertionsPerThread + j; + atomicMap.emplace(key, "Value " + std::to_string(key)); + } + }); + } + + for (auto &t : threads) { + t.join(); + } +} + +void atomicMapBenchmarkComplex(int threadCount, int insertionsPerThread) { + auto v = std::vector{"Complex", "Data", "Type"}; + + folly::AtomicUnorderedInsertMap> + atomicMap(threadCount * insertionsPerThread); + + std::vector threads; + for (int i = 0; i < threadCount; ++i) { + threads.emplace_back([&atomicMap, &v, i, insertionsPerThread] { + for (int j = 1; j <= insertionsPerThread; ++j) { + std::string key = "Key" + std::to_string(i * insertionsPerThread + j); + atomicMap.emplace(std::move(key), v); + } + }); + } + + for (auto &t : threads) { + t.join(); + } +} + +BENCHMARK(UnorderedMapMutexedSingleThreaded, n) { mutexMapBenchmark(1, n); } +BENCHMARK(UnorderedMapMutexedMultiThreaded, n) { mutexMapBenchmark(5, n); } +BENCHMARK(UnorderedMapMutexedMaxThreads, n) { mutexMapBenchmark(12, n); } + 
+BENCHMARK(ConcurrentHashMapSingleThreaded, n) { concurrentMapBenchmark(1, n); } +BENCHMARK(ConcurrentHashMapMultiThreaded, n) { concurrentMapBenchmark(5, n); } +BENCHMARK(ConcurrentHashMapMaxThreads, n) { concurrentMapBenchmark(12, n); } + +BENCHMARK(ConcurrentHashMapComplexSingleThreaded, n) { + concurrentMapBenchmarkComplex(1, n); +} + +BENCHMARK(ConcurrentHashMapComplexMultiThreaded, n) { + concurrentMapBenchmarkComplex(5, n); +} + +BENCHMARK(ConcurrentHashMapComplexMaxThreads, n) { + concurrentMapBenchmarkComplex(12, n); +} + +BENCHMARK(AtomicUnorderedMapSingleThreaded, n) { atomicMapBenchmark(1, n); } +BENCHMARK(AtomicUnorderedMapMultiThreaded, n) { atomicMapBenchmark(5, n); } +BENCHMARK(AtomicUnorderedMapMaxThreads, n) { atomicMapBenchmark(12, n); } + +BENCHMARK(AtomicUnorderedMapComplexSingleThreaded, n) { + atomicMapBenchmarkComplex(1, n); +} + +BENCHMARK(AtomicUnorderedMapComplexMultiThreaded, n) { + atomicMapBenchmarkComplex(5, n); +} + +BENCHMARK(AtomicUnorderedMapComplexMaxThreads, n) { + atomicMapBenchmarkComplex(12, n); +} + +int main(int argc, char *argv[]) { + folly::Init init(&argc, &argv); + folly::runBenchmarks(); + return 0; +} \ No newline at end of file diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..e1303df --- /dev/null +++ b/readme.md @@ -0,0 +1,179 @@ +
+
+ +
+ textract logo +
+ +
+
+ +# textract + +
+ +_Single-header, high-performance_ **C++ image processing** library that extracts text from images and writes it to text files. + +
+ +
+ +
+ +Build from Source using **CMake** + +#### Dependencies + +
+ +```bash + +brew install opencv openssl libomp folly tesseract + +``` + +
+ +#### Build + +
+ +```bash + +cd textract && mkdir build && cd build +cmake .. +make + +# using LLVM and Clang++ directly +cmake -DCMAKE_CXX_COMPILER=/path/to/clang++ -DCMAKE_C_COMPILER=/path/to/clang .. +make + +# getting clang++ and clang paths +echo $(brew --prefix llvm)/bin/clang++ +echo $(brew --prefix llvm)/bin/clang + +``` + +
+ +## Design + +
+ +#### OpenCV and Tesseract + +_OpenCV_ decodes and preprocesses each image (grayscale conversion and thresholding), and _Tesseract OCR_ extracts the text from it. + +
+ +#### OpenSSL + +For generating _SHA256_ hashes from Image bytes and metadata. + +
+ +#### OpenMP + +Provides _parallelization_ so that multiple images can be processed concurrently when OpenMP is available. + +
+ +#### Folly + +_textract_ uses Folly's **AtomicUnorderedInsertMap** as its in-memory cache, which provides wait-free parallel access to cached results. + +[Folly](https://github.com/facebook/folly)::[AtomicUnorderedInsertMap](https://github.com/facebook/folly/blob/main/folly/AtomicUnorderedMap.h) + +
+ +
+ +## Usage + +
+ +Process Images and get their textual content + +
+ +```cpp +#include "textract.h" + +int main() { + imgstr::ImgProcessor app = imgstr::ImgProcessor(); + + std::vector results = app.processImages("cs101_notes.png","bio.jpeg"); + + app.convertImageToTextFile("cs101_notes.png", "notes"); + + return 0; +} + +``` + +
+ +Process all valid Image files from a directory and create text files + +
+ +```cpp +#include "imgtotext.h" + +int main() { + + /* Process 10000 images using parallelism */ + + imgstr::ImageTranslator app = imgstr::ImageTranslator(10000); + + + app.processImagesWriteResults("/path/to/dir"); + + + return 0; +} + +``` + +
+ +
+ +### In Memory Cache Benchmarks + +
+ +```bash +============================================================================ +/textract/benchmarks/cache_benchmark.cc relative time/iter iters/s +============================================================================ +UnorderedMapMutexedSingleThreaded 254.95ns 3.92M +UnorderedMapMutexedMultiThreaded 3.23us 309.19K +UnorderedMapMutexedMaxThreads 7.28us 137.27K +ConcurrentHashMapSingleThreaded 859.52ns 1.16M +ConcurrentHashMapMultiThreaded 3.41us 293.37K +ConcurrentHashMapMaxThreads 26.40us 37.87K +ConcurrentHashMapComplexSingleThreaded 1.43us 700.82K +ConcurrentHashMapComplexMultiThreaded 4.81us 207.69K +ConcurrentHashMapComplexMaxThreads 32.92us 30.38K +AtomicUnorderedMapSingleThreaded 159.61ns 6.27M +AtomicUnorderedMapMultiThreaded 403.47ns 2.48M +AtomicUnorderedMapMaxThreads 1.63us 611.78K +AtomicUnorderedMapComplexSingleThreaded 917.79ns 1.09M +AtomicUnorderedMapComplexMultiThreaded 2.44us 409.85K +AtomicUnorderedMapComplexMaxThreads 12.62us 79.25K +============================================================================ + +``` + +
+ +In these runs **AtomicUnorderedInsertMap** is roughly **5x to 16x** faster than ConcurrentHashMap for simple values (about **8x** with 5 threads) and about **1.6x to 2.6x** faster for the complex value type. + +The [AtomicUnorderedInsertMap](https://github.com/facebook/folly/blob/main/folly/AtomicUnorderedMap.h) header documents the tradeoffs. + +
+ +
+ +Author: [kuro337](https://github.com/kuro337) diff --git a/tests/ocr_test.cc b/tests/ocr_test.cc new file mode 100644 index 0000000..9ba439f --- /dev/null +++ b/tests/ocr_test.cc @@ -0,0 +1,105 @@ +#include "../textract.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace imgstr; + +class MyTestSuite : public ::testing::Test { +public: + const std::string tempDir = "tmp"; + const std::string path = "../../images/"; + const std::string inputFile = "../../images/imgtext.jpeg"; + const std::vector images = {"screenshot.png", "imgtext.jpeg", + "compleximgtext.png", + "scatteredtext.png"}; + ImgProcessor imageTranslator = ImgProcessor(); + +protected: + void SetUp() override { ::testing::internal::CaptureStderr(); } + + void TearDown() override { + + std::string captured_stdout_ = ::testing::internal::GetCapturedStderr(); + + std::filesystem::remove_all(tempDir); + } +}; + +TEST_F(MyTestSuite, EnvironmentTest) { EXPECT_NO_THROW(printSystemInfo()); } + +TEST_F(MyTestSuite, ConvertImageToTextFile) { + imageTranslator.convertImageToTextFile(inputFile, tempDir); + bool fileExists = std::filesystem::exists(tempDir + "/" + "imgtext.txt"); + ASSERT_TRUE(fileExists); +} + +TEST_F(MyTestSuite, WriteFileTest) { + + std::vector paths; + std::transform(images.begin(), images.end(), std::back_inserter(paths), + [&](const std::string &img) { return path + img; }); + + EXPECT_NO_THROW(imageTranslator.addFiles(paths)); + EXPECT_NO_THROW(imageTranslator.convertImagesToTextFiles("tmp")); + + std::vector test_lengths = {20, 1, 2, 9}; + + for (size_t i = 0; i < images.size(); ++i) { + std::size_t lastDot = images[i].find_last_of('.'); + std::string expectedOutputPath = + tempDir + "/" + images[i].substr(0, lastDot) + ".txt"; + + bool fileExists = std::filesystem::exists(expectedOutputPath); + ASSERT_TRUE(fileExists); + + std::ifstream outputFile(expectedOutputPath); + ASSERT_TRUE(outputFile.is_open()); + + int lineCount = 0; + std::string line; + 
while (std::getline(outputFile, line)) + lineCount++; + + outputFile.close(); + EXPECT_GE(lineCount, test_lengths[i]); + } +} + +TEST_F(MyTestSuite, BasicAssertions) { + tesseract::TessBaseAPI ocr; + + for (int i = static_cast(ISOLang::en); + i <= static_cast(ISOLang::de); ++i) { + ISOLang lang = static_cast(i); + const char *langStr = isoToTesseractLang(lang).c_str(); + + EXPECT_NO_THROW((ocr.Init(nullptr, langStr))); + } + + try { + ocr.Init(nullptr, "nonexistent"); + } catch (const std::exception &e) { + + std::cout + << "To enable all languages for Tesseract - make sure pack is " + "installed.\nhttps://github.com/dataiku/dss-plugin-tesseract-ocr/" + "tree/v1.0.2#specific-languages\n" + << std::endl; + + EXPECT_STRNE(e.what(), "Please make sure the TESSDATA_PREFIX environment " + "variable is set to your \"tessdata\" directory."); + } + + EXPECT_STRNE("hello", "world"); + EXPECT_EQ(7 * 6, 42); +} + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tests/similarity_test.cc b/tests/similarity_test.cc new file mode 100644 index 0000000..6917466 --- /dev/null +++ b/tests/similarity_test.cc @@ -0,0 +1,40 @@ +#include "../textract.h" +#include + +using namespace imgstr; + +TEST(SimilaritySuite, SingleString) { + + using namespace std; + + string a = "intention"; + string b = "execution"; + + EXPECT_EQ(levenshteinScore(a, b), 5); +} + +TEST(SimilaritySuite, ImageSHA256Equal) { + + using namespace std; + const string path = "../../images/"; + string file_a = path + "screenshot.png"; + string file_b = path + "dupescreenshot.png"; + + string sha_a = computeSHA256(file_a); + string sha_b = computeSHA256(file_b); + + EXPECT_EQ(sha_a, sha_b); +} + +TEST(SimilaritySuite, ImageSHA256Unequal) { + + using namespace std; + const string path = "../../images/"; + string file_a = path + "screenshot.png"; + string file_b = path + "imgtext.jpeg"; + + string sha_a = computeSHA256(file_a); + string sha_b = 
computeSHA256(file_b); + + EXPECT_NE(sha_a, sha_b); +} \ No newline at end of file diff --git a/textract.h b/textract.h new file mode 100644 index 0000000..7d4e376 --- /dev/null +++ b/textract.h @@ -0,0 +1,564 @@ + +#ifndef TEXTRACT_H +#define TEXTRACT_H + +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace imgstr { + +#pragma region TEXT_SIMILARITY /* Text Similarity Declarations */ + +size_t levenshteinScore(std::string a, std::string b); + +#pragma endregion + +#pragma region CRYPTOGRAPHY /* Cryptography Declarations */ + +std::string computeSHA256(const std::vector &data); + +std::string computeSHA256(const std::string &filePath); + +#pragma endregion + +#pragma region SYSTEM_UTILS /* System Environment helpers */ + +void printSystemInfo(); + +const std::string delimiter = + "\x1b[90m***********************************************************"; +const std::string check_mark = "\x1b[32m✔\x1b[0m"; +const std::string green = "\x1b[92m"; +const std::string end = "\x1b[0m"; +const std::string yellow = "\x1b[93m"; + +#pragma endregion + +#pragma region FILE_IO /* File IO helpers */ + +std::vector readBytesFromFile(const std::string &filename); + +void writeToNewFile(const std::string &content, const std::string &output_path); + +#pragma endregion + +#pragma region OPENCV_UTILS /* OpenCV Declarations */ + +enum class ISOLang { en, es, fr, hi, zh, de }; + +inline std::string isoToTesseractLang(ISOLang isoLang) { + switch (isoLang) { + case ISOLang::en: + return "eng"; + case ISOLang::es: + return "spa"; + case ISOLang::fr: + return "fra"; + case ISOLang::de: + return "deu"; + case ISOLang::zh: + return "chi_sim"; + case ISOLang::hi: + return "hin"; + default: + return "eng"; + } +} + +std::string extractTextFromImageBytes(const std::vector &file_content, + const std::string &lang); + +std::string extractTextFromImageFile(const std::string 
&file_path, + const std::string &lang); + +std::string extractTextFromImageFile(const std::string &file_path, + ISOLang lang); + +#pragma endregion + +/* + +ImgProcessor : Core header class + +Provides an efficient, high performance implementation of Text +Extraction from Images. + +Supports Parallelized Image Processing and maintains an in-memory cache. + +Uses an Atomic Unordered Map for Safe Wait-Free parallel access to ensure images +are not processed twice + +Cache retrieval logic is determined by the SHA256 hash of the Image bytes + +The SHA256 Byte Hash enables duplicate images to not be processed even if the +file names or paths differ. + +*/ + +#pragma region imgstr_core /* Core Class for Image Processing and Text Extraction */ + +struct Image { + std::string path; + std::string name; + std::size_t byte_size; + std::string text_content; + std::string content_fuzzhash; + std::string image_sha256; +}; + +class ImgProcessor { + +private: + std::string dir; + + std::vector files; + + folly::AtomicUnorderedInsertMap cache; + + std::optional getFromCacheIfExists(const std::string &img_sha) { + auto text_from_cache = cache.find(img_sha); + + if (text_from_cache != cache.end()) { + return text_from_cache->second; + } + return std::nullopt; + } + + std::vector processCurrentFiles() { + std::vector processedText; + if (files.empty()) { + std::cout << "Files are empty" << std::endl; + return {}; + } + for (const auto &file : files) { + + try { + auto data = readBytesFromFile(file); + std::string img_hash = computeSHA256(data); + auto text_from_cache = getFromCacheIfExists(img_hash); + + if (text_from_cache) { + printCacheHit(file); + processedText.emplace_back(*text_from_cache); + } else { + std::string img_text = extractTextFromImageBytes(data, "eng"); + cache.emplace(img_hash, img_text); + processedText.emplace_back(img_text); + } + + } catch (const std::exception &e) { + std::cout << "Failed to Extract Text from Image file: " << file + << ". 
Error: " << e.what() << '\n'; + } + } + + return processedText; + } + + void printCacheHit(const std::string &file) { + std::cout << delimiter << '\n' + << check_mark << green << " Image Already Processed : " << end + << file << '\n'; + } + +public: + ImgProcessor(size_t capacity = 1000) + : files(), + cache(folly::AtomicUnorderedInsertMap( + capacity)) {} + + void setDir(const std::string dir_path) { dir = dir_path; } + + void addFile(const std::string &file_path) { files.push_back(file_path); } + + void resetCache(size_t new_capacity) { + cache = + folly::AtomicUnorderedInsertMap(new_capacity); + } + + template + std::vector processImages(FileNames... fileNames) { + addFiles({fileNames...}); // Use initializer_list to unpack the variadic + + return processCurrentFiles(); + } + + void addFiles(std::initializer_list fileList) { + for (const auto &file : fileList) { + this->files.push_back(file); + } + } + + void addFiles(const std::vector &fileList) { + for (const auto &file : fileList) { + files.push_back(file); + } + } + + void printFiles() { + for (const auto &file : files) { + std::cout << file << std::endl; + } + } + + // pass file path , get Text + std::string getImageText(const std::string &file_path, + ISOLang lang = ISOLang::en) { + try { + + // if already processed - return else process image and add to cache + auto data = readBytesFromFile(file_path); + std::string img_hash = computeSHA256(data); + auto text_from_cache = getFromCacheIfExists(img_hash); + + if (text_from_cache) { + printCacheHit(file_path); + return *text_from_cache; + } else { + std::string img_text = + extractTextFromImageBytes(data, isoToTesseractLang(lang)); + cache.emplace(img_hash, img_text); + + return img_text; + } + } catch (const std::exception &e) { + std::cout << "Failed to Extract Text from Image file: " << file_path + << ". 
Error: " << e.what() << '\n'; + } + + return nullptr; + } + + void writeTextToFile(const std::string &content, + const std::string &output_path) { + + if (std::filesystem::exists(output_path)) { + std::cerr << "Error: File already exists - " << output_path << std::endl; + return; + } + + std::ofstream outFile(output_path); + if (!outFile) { + std::cerr << "Error opening file: " << output_path << std::endl; + return; + } + outFile << content; + } + + void convertImageToTextFile(const std::string &input_file, + const std::string &output_dir = "", + ISOLang lang = ISOLang::en) { +#ifdef _WIN32 + std::string path_separator = "\\"; +#else + std::string path_separator = "/"; +#endif + + // create output dir if not exists + if (!output_dir.empty() && !std::filesystem::exists(output_dir)) { + std::filesystem::create_directories(output_dir); + } + std::string outputFilePath = output_dir; + if (!output_dir.empty() && output_dir.back() != path_separator.back()) { + outputFilePath += path_separator; + } + + std::size_t lastSlash = input_file.find_last_of("/\\"); + std::size_t lastDot = input_file.find_last_of('.'); + std::string filename = + input_file.substr(lastSlash + 1, lastDot - lastSlash - 1); + outputFilePath += filename + ".txt"; + + std::string converted = getImageText(input_file, lang); + writeTextToFile(converted, outputFilePath); + } + + void convertImagesToTextFiles(const std::string &output_dir = "", + ISOLang lang = ISOLang::en) { +#ifdef _WIN32 + std::string path_separator = "\\"; +#else + std::string path_separator = "/"; +#endif + + // create output dir if not exists + if (!output_dir.empty() && !std::filesystem::exists(output_dir)) { + std::filesystem::create_directories(output_dir); + } + +#pragma omp parallel for + for (const auto &file : files) { + // int threads = omp_get_thread_num(); + // cout << "Open MP thread num " << threads << endl; + + std::string outputFilePath = output_dir; + if (!output_dir.empty() && output_dir.back() != 
path_separator.back()) { + outputFilePath += path_separator; + } + + std::size_t lastSlash = file.find_last_of("/\\"); + std::size_t lastDot = file.find_last_of('.'); + std::string filename = + file.substr(lastSlash + 1, lastDot - lastSlash - 1); + outputFilePath += filename + ".txt"; + + std::string converted = getImageText(file, lang); + writeTextToFile(converted, outputFilePath); + } + } +}; + +#pragma endregion + +/* Declaration Implementations */ + +#pragma region TEXT_SIMILARITY_IMPL + +inline size_t levenshteinScore(std::string a, std::string b) { + using namespace std; + + size_t m = a.size(); + size_t n = b.size(); + + vector> dp(m + 1, vector(n + 1)); + + for (size_t i = 1; i < m; i++) + dp[i][0] = i; + + for (size_t i = 1; i < n; i++) + dp[0][i] = i; + + for (size_t i = 1; i <= m; i++) { + for (size_t j = 1; j <= n; j++) { + size_t eq = a[i - 1] == b[j - 1] ? 0 : 1; + + dp[i][j] = std::min(dp[i - 1][j - 1] + eq, + std::min(dp[i - 1][j] + 1, dp[i][j - 1] + 1)); + } + } + + return dp[m][n]; + + return 1; +} + +#pragma endregion + +#pragma region CRYPTOGRAPHY_IMPL +#include +#include +#include +#include +#include + +inline std::string computeSHA256(const std::vector &data) { + EVP_MD_CTX *mdContext = EVP_MD_CTX_new(); + if (mdContext == nullptr) { + throw std::runtime_error("Failed to create EVP_MD_CTX"); + } + + if (EVP_DigestInit_ex(mdContext, EVP_sha256(), nullptr) != 1) { + EVP_MD_CTX_free(mdContext); + throw std::runtime_error("Failed to initialize EVP Digest"); + } + + if (EVP_DigestUpdate(mdContext, data.data(), data.size()) != 1) { + EVP_MD_CTX_free(mdContext); + throw std::runtime_error("Failed to update digest"); + } + + unsigned char hash[EVP_MD_size(EVP_sha256())]; + unsigned int lengthOfHash = 0; + + if (EVP_DigestFinal_ex(mdContext, hash, &lengthOfHash) != 1) { + EVP_MD_CTX_free(mdContext); + throw std::runtime_error("Failed to finalize digest"); + } + + EVP_MD_CTX_free(mdContext); + + std::stringstream ss; + for (unsigned int i = 0; i < 
lengthOfHash; ++i) { + ss << std::hex << std::setw(2) << std::setfill('0') << (int)hash[i]; + } + return ss.str(); +} + +inline std::string computeSHA256(const std::string &filePath) { + std::ifstream file(filePath, std::ifstream::binary); + if (!file) { + throw std::runtime_error("Could not open file: " + filePath); + } + + std::vector data(std::istreambuf_iterator(file), {}); + + return computeSHA256(data); +} +#pragma endregion + +#pragma region OPENCV_IMPL +inline std::string +extractTextFromImageBytes(const std::vector &file_content, + const std::string &lang = "eng") { + + cv::Mat img = cv::imdecode(file_content, cv::IMREAD_COLOR); + if (img.empty()) { + throw std::runtime_error("Failed to load image from buffer"); + } + + cv::Mat gray; + cv::cvtColor(img, gray, cv::COLOR_BGR2GRAY); + cv::threshold(gray, gray, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU); + + tesseract::TessBaseAPI ocr; + if (ocr.Init(nullptr, lang.c_str()) != 0) { + throw std::runtime_error("Could not initialize tesseract."); + } + + ocr.SetImage(gray.data, gray.cols, gray.rows, 1, gray.step); + std::string outText(ocr.GetUTF8Text()); + ocr.End(); + + return outText; +} + +inline std::string extractTextFromImageFile(const std::string &file_path, + const std::string &lang) { + + cv::Mat img = cv::imread(file_path); + if (img.empty()) { + throw std::runtime_error("Failed to load image: " + file_path); + } + cv::Mat gray; + cv::cvtColor(img, gray, cv::COLOR_BGR2GRAY); + cv::threshold(gray, gray, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU); + + // Init Tesseract OCR engine + // brew install tesseract-lang for all langs + tesseract::TessBaseAPI ocr; + if (ocr.Init(nullptr, lang.c_str()) != 0) { + throw std::runtime_error("Could not initialize tesseract."); + } + + // load image to Tesseract + ocr.SetImage(gray.data, gray.cols, gray.rows, 1, gray.step); + + // run OCR + std::string outText(ocr.GetUTF8Text()); + + ocr.End(); + + return outText; +} + +inline std::string extractTextFromImage(const 
std::string &file_path, + ISOLang lang) { + std::string langCode = isoToTesseractLang(lang); + return extractTextFromImageFile(file_path, langCode); +} +#pragma endregion + +#pragma region FILE_IO_IMPL +// Read File : pass file path , get string +inline std::vector readBytesFromFile(const std::string &filename) { + std::ifstream file(filename, std::ios::binary | std::ios::ate); + if (!file) { + throw std::runtime_error("Failed to open file: " + filename); + } + std::streamsize size = file.tellg(); + file.seekg(0, std::ios::beg); + std::vector buffer(size); + if (!file.read(reinterpret_cast(buffer.data()), size)) { + throw std::runtime_error("Failed to read file: " + filename); + } + return buffer; +} + +// Write Content : write content to provided Path if it is a new file +inline void writeToNewFile(const std::string &content, + const std::string &output_path) { + + if (std::filesystem::exists(output_path)) { + std::cerr << "Error: File already exists - " << output_path << std::endl; + return; + } + std::ofstream outFile(output_path); + if (!outFile) { + std::cerr << "Error opening file: " << output_path << std::endl; + return; + } + outFile << content; +} +#pragma endregion + +#pragma region SYSTEM_IMPL /* Helpers for Host Environment */ +inline void printSystemInfo() { +#ifdef __clang__ + std::cout << "Clang version: " << __clang_version__ << std::endl; +#else + std::cout << "Not using Clang." << std::endl; +#endif + +#ifdef _OPENMP + std::cout << "OpenMP is enabled." << std::endl; +#else + std::cout << "OpenMP is not enabled." 
<< std::endl; +#endif +}; + +#pragma endregion + +#pragma region LOGGING_IMPL +enum class ANSICode { + delimiter_star, + delimiter_dim, + green_bold, + green, + error, + success_tick, + failure_cross, + warning_brightyellow, + end, +}; + +constexpr const char *ANSI(ANSICode ansi) { + switch (ansi) { + case ANSICode::delimiter_dim: + return "\x1b[90m***********************\x1b[0m"; + case ANSICode::delimiter_star: + return "\x1b[90m***********************************************************" + "********************\x1b[0m"; + case ANSICode::green: + return "\x1b[92m"; + case ANSICode::green_bold: + return "\x1b[1;32m"; + case ANSICode::error: + return "\x1b[31m"; + case ANSICode::success_tick: + return "\x1b[32m✔\x1b[0m"; + case ANSICode::failure_cross: + return "\x1b[31m✖\x1b[0m"; + case ANSICode::warning_brightyellow: + return "\x1b[93m"; + + case ANSICode::end: + return "\x1b[0m"; + default: + return ""; + } +} +#pragma endregion + +} // namespace imgstr + +#endif // TEXTRACT_H