From c6e6b8c637a1cacb2d3d7ab5abd929a5910a746b Mon Sep 17 00:00:00 2001 From: kuro337 Date: Fri, 19 Apr 2024 02:42:44 -0400 Subject: [PATCH] llvm fsdir, + unit tests --- .clang-format | 30 +++-- CMakeLists.txt | 16 ++- docs/headers/including.md | 16 +++ src/fs.cc | 63 +++++++++++ src/fs.h | 15 +++ tests/fs_test.cc | 87 ++++++++++++++ tests/ocr_test.cc | 233 ++++++++++++++++++++------------------ textract.h | 158 +++++++++++++++++++------- 8 files changed, 454 insertions(+), 164 deletions(-) create mode 100644 docs/headers/including.md create mode 100644 src/fs.cc create mode 100644 src/fs.h create mode 100644 tests/fs_test.cc diff --git a/.clang-format b/.clang-format index 7114f83..3d08b61 100644 --- a/.clang-format +++ b/.clang-format @@ -1,6 +1,8 @@ # options: https://clang.llvm.org/docs/ClangFormatStyleOptions.html +# clang-format -i to run in place - and validate Config BasedOnStyle: LLVM IndentWidth: 4 +ColumnLimit: 120 SpaceAfterCStyleCast: true UseTab: Never AlignTrailingComments: true @@ -8,6 +10,8 @@ AlignConsecutiveMacros: Consecutive AlignConsecutiveBitFields: true UseCRLF: false AlignAfterOpenBracket: Align +BinPackArguments: false # false -> if fn call args exceed will be on newlines each +BinPackParameters: false # false -> if params exceed theyll be on newlines each BreakBeforeBraces: Custom SpaceBeforeParens: ControlStatements PointerAlignment: Right @@ -42,8 +46,9 @@ AllowShortLambdasOnASingleLine: None AllowShortLoopsOnASingleLine: false AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes -BinPackArguments: true -BinPackParameters: true + +AllowAllParametersOfDeclarationOnNextLine: true +AllowAllArgumentsOnNextLine: true BraceWrapping: AfterCaseLabel: false AfterClass: false @@ -56,22 +61,25 @@ BraceWrapping: AfterUnion: false BeforeCatch: false BeforeElse: false - IndentBraces: true SplitEmptyFunction: true SplitEmptyNamespace: true SplitEmptyRecord: true + # IndentBraces: true indents if statements to be non aligned within the body + BreakBeforeBinaryOperators: None BreakBeforeTernaryOperators: false BreakConstructorInitializers: AfterColon BreakInheritanceList: AfterColon BreakStringLiterals: true -ColumnLimit: 160 + +### Nests Namespaces namespace Foo { namespace Bar { CompactNamespaces: true + Cpp11BracedListStyle: true -DeriveLineEnding: false -DerivePointerAlignment: false -FixNamespaceComments: true -IncludeBlocks: Merge +# LineEnding: LF # line ending style : \n or \r\n +DerivePointerAlignment: false ## analyze for most common alignment of & and * and override PointerAlignment if diff +FixNamespaceComments: true # adds namespace comments such as // naemspace a +IncludeBlocks: Merge # sort and merge imports IndentCaseLabels: true IndentPPDirectives: None IndentWrappedFunctionNames: true @@ -79,8 +87,8 @@ KeepEmptyLinesAtTheStartOfBlocks: false MaxEmptyLinesToKeep: 1 NamespaceIndentation: All PenaltyBreakAssignment: 16 -PenaltyBreakBeforeFirstCallParameter: 2 -PenaltyBreakString: 64 -PenaltyExcessCharacter: 0 +PenaltyBreakBeforeFirstCallParameter: 0 +PenaltyBreakString: 64 # sets extra allowed cols before strings are broken onto newlines +# PenaltyExcessCharacter: 0 # sets penalty for chars once exceeding Col Limit PenaltyReturnTypeOnItsOwnLine: 0 ReflowComments: true diff --git a/CMakeLists.txt b/CMakeLists.txt index 1adb0c4..757f857 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,14 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # add_compile_options(-Wno-deprecated-declarations) +# Creates header file with path to images folder +# set(IMAGE_FOLDER_PATH "images") +# configure_file( +# "${CMAKE_CURRENT_SOURCE_DIR}/image_folder_path.h.in" +# "${CMAKE_CURRENT_BINARY_DIR}/image_folder_path.h" +# @ONLY +# ) + # ensure LLVM_DIR is set macro(add_llvm_support) set(llvm_components core support irreader) @@ -16,6 +24,7 @@ macro(add_llvm_support) endforeach() endmacro() +# include_directories(${CMAKE_SOURCE_DIR}/src) include_directories(${CMAKE_SOURCE_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) @@ -105,10 +114,13 @@ endif() enable_testing() -set(TEST_EXECUTABLES tesseractparallel_test bulk_test pdf_test textractapi_tests ocr_test atomic_test similarity_test threadlocal_test) +set(TEST_EXECUTABLES fs_test + tesseractparallel_test bulk_test pdf_test + textractapi_tests ocr_test atomic_test similarity_test + threadlocal_test) foreach(TEST_EXECUTABLE IN LISTS TEST_EXECUTABLES) - add_executable(${TEST_EXECUTABLE} tests/${TEST_EXECUTABLE}.cc) + add_executable(${TEST_EXECUTABLE} tests/${TEST_EXECUTABLE}.cc src/fs.cc) set_target_properties(${TEST_EXECUTABLE} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests" ) diff --git a/docs/headers/including.md b/docs/headers/including.md new file mode 100644 index 0000000..dc4d958 --- /dev/null +++ b/docs/headers/including.md @@ -0,0 +1,16 @@ +# Including Headers + +`src/fs.cc` and `src/fs.h` + +- `.cc` must have the fn impl - (non-inline) or Visibility is Private +- `.cc` must include the Header def file `#include ` + +## Files that need Access to Funcs + +- Simply add the `.cc` file to the `Add Executable` so that CMake will compile + it along with the Files + +```bash +add_executable(${TEST_EXECUTABLE} tests/${TEST_EXECUTABLE}.cc src/fs.cc) + +``` diff --git a/src/fs.cc b/src/fs.cc new file mode 100644 index 0000000..45dbedc --- /dev/null +++ b/src/fs.cc @@ -0,0 +1,63 @@ + +#include "fs.h" +#include +#include +#include + +auto getFilePaths(const llvm::Twine &directoryPath) -> llvm::Expected> { + std::error_code ERR; + llvm::sys::fs::directory_iterator dirIt(directoryPath, ERR); + + if (ERR) { + return llvm::make_error(ERR.message(), ERR); + } + + llvm::sys::fs::directory_iterator dirEnd; + std::vector filePaths; + + for (; dirIt != dirEnd && !ERR; dirIt.increment(ERR)) { + if (ERR) { + return llvm::make_error(ERR.message(), ERR); + } + filePaths.push_back(dirIt->path()); + } + + return filePaths; +} +auto getFileInfo(const llvm::Twine &Path) -> llvm::Expected { + llvm::sys::fs::file_status Status; + if (std::error_code ERR = llvm::sys::fs::status(Path, Status)) { + return llvm::make_error(ERR.message(), ERR); + } + return Status; +} + +auto createDirectories(const llvm::Twine &path) -> llvm::Expected { + if (auto err = llvm::sys::fs::create_directories(path); err) { + return llvm::make_error(err.message(), err); + } + return true; +} + +auto createDirectoryForFile(const llvm::Twine &filePath) -> llvm::Expected { + llvm::SmallString<256> fullPathStorage; + filePath.toVector(fullPathStorage); + + llvm::StringRef fullPathRef(fullPathStorage); + + llvm::StringRef directoryPathRef = llvm::sys::path::parent_path(fullPathRef); + + llvm::SmallString<256> directoryPath(directoryPathRef); + + if (llvm::sys::fs::exists(directoryPath)) { + return true; + } + + if (auto err = llvm::sys::fs::create_directories(directoryPath); err) { + llvm::errs() << "Error creating directory '" << directoryPath << "': " << err.message() << "\n"; + return llvm::make_error( + llvm::formatv("Error creating directory {0}: {1}", directoryPath.str(), err.message()), err); + } + + return true; +} \ No newline at end of file diff --git a/src/fs.h b/src/fs.h new file mode 100644 index 0000000..1c2b3bd --- /dev/null +++ b/src/fs.h @@ -0,0 +1,15 @@ +// fs.h +#ifndef FS_H +#define FS_H + +#include + +auto getFilePaths(const llvm::Twine &directoryPath) -> llvm::Expected>; + +auto getFileInfo(const llvm::Twine &Path) -> llvm::Expected; + +auto createDirectories(const llvm::Twine &path) -> llvm::Expected; + +auto createDirectoryForFile(const llvm::Twine &filePath) -> llvm::Expected; + +#endif // FS_H diff --git a/tests/fs_test.cc b/tests/fs_test.cc new file mode 100644 index 0000000..6e3a59b --- /dev/null +++ b/tests/fs_test.cc @@ -0,0 +1,87 @@ +#include +#include +#include + +template +auto ErrorExists(llvm::Expected &expected, const std::string &errorMessage) -> bool { + if (!expected) { + llvm::Error err = expected.takeError(); + std::cerr << "Assert Failure:" << errorMessage << ": " << llvm::toString(std::move(err)) << '\n'; + return true; + } + return false; +} + +class LLVMFsTests: public ::testing::Test { + protected: + void SetUp() override { + } + + void TearDown() override { + } + + public: + const std::string validFolder = "../../images"; + const std::string emptyFolder = "../../empty"; +}; + +TEST_F(LLVMFsTests, GetFilePathsEmpty) { + auto result = getFilePaths(emptyFolder); + + ASSERT_FALSE(ErrorExists(result, "Expected valid result, but got an error")); + + if (!result) { + llvm::Error err = result.takeError(); + llvm::consumeError(std::move(err)); + FAIL() << "Expected valid result, but got error."; + } + ASSERT_TRUE(result->empty()) << "Expected empty directory."; +} + +TEST_F(LLVMFsTests, GetFilePaths) { + auto result = getFilePaths(validFolder); + ASSERT_FALSE(ErrorExists(result, "Valid Folder expected no Error")); + + auto emptyPathResult = getFilePaths(emptyFolder); + ASSERT_FALSE(ErrorExists(emptyPathResult, "Empty Dir should cause no Errors")); + + auto invalidPathResult = getFilePaths("path/to/non/existing/directory"); + + ASSERT_TRUE(ErrorExists(invalidPathResult, "Invalid Dir should return an Error")); +} + +TEST_F(LLVMFsTests, GetFileInfo) { + auto fileInfoResult = getFileInfo(validFolder + "/imgtext.jpeg"); + ASSERT_FALSE(ErrorExists(fileInfoResult, "File Info Valid")); + + auto directoryInfoResult = getFileInfo(validFolder); + ASSERT_FALSE(ErrorExists(directoryInfoResult, "Dir Info Valid")); + + auto nonExistentResult = getFileInfo("/path/to/non/existent/file.txt"); + ASSERT_TRUE(ErrorExists(nonExistentResult, "Non Existent should Return an Error")); +} + +TEST_F(LLVMFsTests, DirectoryCreationTests) { + auto existingDirResult = createDirectories(validFolder); + + ASSERT_FALSE(ErrorExists(existingDirResult, "Creating existing directory should not error")); + ASSERT_TRUE(*existingDirResult); + + auto newDirResult = createDirectories("llvmtest"); + + ASSERT_FALSE(ErrorExists(newDirResult, "Creating new directory should succeed")); + ASSERT_TRUE(*newDirResult); + + auto fileInNewDir = createDirectoryForFile("llvmfiletest/mock.txt"); + ASSERT_FALSE(ErrorExists(fileInNewDir, "Creating directory for new file should succeed")); + ASSERT_TRUE(*fileInNewDir); + + auto fileInExistingDir = createDirectoryForFile("llvmfiletest/mock.txt"); + ASSERT_FALSE(ErrorExists(fileInExistingDir, "Creating directory for an existing file should still succeed")); + ASSERT_TRUE(*fileInExistingDir); +} + +auto main(int argc, char **argv) -> int { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tests/ocr_test.cc b/tests/ocr_test.cc index 30cddae..e3f0946 100644 --- a/tests/ocr_test.cc +++ b/tests/ocr_test.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -15,24 +16,29 @@ #include #endif -class MyTestSuite : public ::testing::Test { +class MyTestSuite: public ::testing::Test { public: + MyTestSuite() { + const std::string imgFolder = "../../images"; + auto imagePaths = getFilePaths(imgFolder); + + if (imagePaths) { + fpaths = imagePaths.get(); + } + + for (const auto &e: fpaths) { + std::cout << e << '\n'; + } + } + + std::vector fpaths; + const std::string inputOpenTest = "../../images/screenshot.png"; std::string tempDir = "tmpOCR"; - const std::string path = "../../images/"; - const std::string inputFile = "../../images/imgtext.jpeg"; - const std::vector images = {"screenshot.png", "imgtext.jpeg", - "compleximgtext.png", - "scatteredtext.png"}; protected: void TearDown() override { - - std::filesystem::path tmp{std::filesystem::temp_directory_path()}; - std::filesystem::create_directories(tmp / "abcdef/example"); - std::uintmax_t n{std::filesystem::remove_all(tmp / "abcdef")}; - std::cout << "Deleted " << n << " files or directories\n"; } }; @@ -47,45 +53,59 @@ TEST_F(MyTestSuite, ConvertImageToTextFile) { std::cout << "Running Convert Test" << std::endl; imgstr::ImgProcessor imageTranslator; - imageTranslator.convertImageToTextFile(inputFile, tempDir); - - bool fileExists = std::filesystem::exists(tempDir + "/" + "imgtext.txt"); - ASSERT_TRUE(fileExists); -} - -TEST_F(MyTestSuite, WriteFileTest) { - - std::vector paths; - std::transform(images.begin(), images.end(), std::back_inserter(paths), - [&](const std::string &img) { return path + img; }); - imgstr::ImgProcessor imageTranslator; - EXPECT_NO_THROW(imageTranslator.addFiles(paths)); - EXPECT_NO_THROW(imageTranslator.convertImagesToTextFiles(tempDir)); - - std::vector test_lengths = {20, 1, 1, 9}; - - for (size_t i = 0; i < images.size(); ++i) { - std::size_t lastDot = images[i].find_last_of('.'); - std::string expectedOutputPath = - tempDir + "/" + images[i].substr(0, lastDot) + ".txt"; - - bool fileExists = std::filesystem::exists(expectedOutputPath); - ASSERT_TRUE(fileExists); - std::ifstream outputFile(expectedOutputPath); - ASSERT_TRUE(outputFile.is_open()); + imageTranslator.convertImageToTextFile(fpaths[0], tempDir); + std::filesystem::path p(fpaths[0]); - int lineCount = 0; - std::string line; - while (std::getline(outputFile, line)) { - lineCount++; - } + bool fileExists = std::filesystem::exists(tempDir + "/" + p.filename().string()); + std::cout << "Tempdir File check: " << tempDir + "/" + p.filename().string() << '\n'; - outputFile.close(); - EXPECT_GE(lineCount, test_lengths[i]); - } + ASSERT_TRUE(fileExists); } +// TEST_F(MyTestSuite, WriteFileTest) { +// // std::vector paths; +// // std::transform(images.begin(), images.end(), +// std::back_inserter(paths), [&](const std::string &img) { +// // return path + img; +// // }); +// imgstr::ImgProcessor imageTranslator; +// EXPECT_NO_THROW(imageTranslator.addFiles(fpaths)); +// EXPECT_NO_THROW(imageTranslator.convertImagesToTextFiles(tempDir)); + +// std::vector test_lengths = {20, 1, 1, 9}; + +// for (size_t i = 0; i < fpaths.size(); ++i) { +// std::size_t lastDot = fpaths[i].find_last_of('.'); +// // std::string expectedOutputPath = tempDir + "/" + +// fpaths[i].substr(0, lastDot) + ".txt"; + +// std::filesystem::path p(fpaths[i]); +// auto expectedOutputPath = tempDir + "/" + +// p.filename().string(); std::filesystem::path +// p2(expectedOutputPath); p2.replace_extension(".txt"); +// expectedOutputPath = p2.string(); +// bool fileExists = std::filesystem::exists(expectedOutputPath); +// // imageTranslator.convertImageToTextFile(fpaths[0], tempDir + +// "/" + p.filename().string()); + +// std::cout << "File Exists WriteFile:" << expectedOutputPath << +// '\n'; ASSERT_TRUE(fileExists); + +// std::ifstream outputFile(expectedOutputPath); +// ASSERT_TRUE(outputFile.is_open()); + +// int lineCount = 0; +// std::string line; +// while (std::getline(outputFile, line)) { +// lineCount++; +// } + +// outputFile.close(); +// EXPECT_GE(lineCount, test_lengths[i]); +// } +// } + TEST_F(MyTestSuite, BasicAssertions) { tesseract::TessBaseAPI ocr; @@ -99,13 +119,11 @@ TEST_F(MyTestSuite, BasicAssertions) { try { ocr.Init(nullptr, "nonexistent"); } catch (const std::exception &e) { - - std::cout - << "To enable all languages for Tesseract - make sure pack is " - "installed.\nhttps://github.com/dataiku/" - "dss-plugin-tesseract-ocr/" - "tree/v1.0.2#specific-languages\n" - << std::endl; + std::cout << "To enable all languages for Tesseract - make sure pack is " + "installed.\nhttps://github.com/dataiku/" + "dss-plugin-tesseract-ocr/" + "tree/v1.0.2#specific-languages\n" + << std::endl; EXPECT_STRNE(e.what(), "Please make sure the TESSDATA_PREFIX environment " @@ -116,78 +134,69 @@ TEST_F(MyTestSuite, BasicAssertions) { EXPECT_EQ(7 * 6, 42); } -auto -extractTextFromImageFileLeptonica(const std::string &file_path, - const std::string &lang = "eng") - -> std::string { +// auto extractTextFromImageFileLeptonica(const std::string &file_path, const +// std::string &lang = "eng") -> std::string { +// auto *api = new tesseract::TessBaseAPI(); +// if (api->Init(nullptr, "eng") != 0) { +// fprintf(stderr, "Could not initialize tesseract.\n"); +// exit(1); +// } +// Pix *image = pixRead(file_path.c_str()); - auto *api = new tesseract::TessBaseAPI(); - if (api->Init(nullptr, "eng") != 0) { - fprintf(stderr, "Could not initialize tesseract.\n"); - exit(1); - } - Pix *image = pixRead(file_path.c_str()); +// // fully automatic - suitable for single columns of text - // fully automatic - suitable for single columns of text +// api->SetPageSegMode(tesseract::PSM_AUTO); - api->SetPageSegMode(tesseract::PSM_AUTO); +// api->SetImage(image); +// std::string outText(api->GetUTF8Text()); +// outText = api->GetUTF8Text(); - api->SetImage(image); - std::string outText(api->GetUTF8Text()); - outText = api->GetUTF8Text(); +// api->End(); +// delete api; +// pixDestroy(&image); +// return outText; +// } - api->End(); - delete api; - pixDestroy(&image); - return outText; -} +// auto extractTextLSTM(const std::string &file_path, const std::string &lang = +// "eng") -> std::string { +// auto *api = new tesseract::TessBaseAPI(); +// if (api->Init(nullptr, "eng", tesseract::OEM_LSTM_ONLY) != 0) { +// fprintf(stderr, "Could not initialize tesseract.\n"); +// exit(1); +// } +// Pix *image = pixRead(file_path.c_str()); -auto -extractTextLSTM(const std::string &file_path, - const std::string &lang = "eng") -> std::string { +// api->SetImage(image); +// std::string outText(api->GetUTF8Text()); +// outText = api->GetUTF8Text(); - auto *api = new tesseract::TessBaseAPI(); - if (api->Init(nullptr, "eng", tesseract::OEM_LSTM_ONLY) != 0) { - fprintf(stderr, "Could not initialize tesseract.\n"); - exit(1); - } - Pix *image = pixRead(file_path.c_str()); +// api->End(); +// delete api; +// pixDestroy(&image); +// return outText; +// } - api->SetImage(image); - std::string outText(api->GetUTF8Text()); - outText = api->GetUTF8Text(); +// TEST_F(MyTestSuite, OEMvsLSTMAnalysis) { +// auto start = imgstr::getStartTime(); +// auto res1 = extractTextFromImageFileLeptonica(fpaths[1]); - api->End(); - delete api; - pixDestroy(&image); - return outText; -} - -TEST_F(MyTestSuite, OEMvsLSTMAnalysis) { - - auto start = imgstr::getStartTime(); - auto res1 = extractTextFromImageFileLeptonica(inputOpenTest); +// std::cout << res1 << '\n'; - std::cout << res1 << '\n'; +// auto time1 = imgstr::getDuration(start); +// std::cout << "Time Leptonica : " << time1 << '\n'; - auto time1 = imgstr::getDuration(start); - std::cout << "Time Leptonica : " << time1 << '\n'; +// auto start2 = imgstr::getStartTime(); +// auto res2 = extractTextLSTM(fpaths[1]); - auto start2 = imgstr::getStartTime(); - auto res2 = extractTextLSTM(inputOpenTest); +// std::cout << res2 << '\n'; - std::cout << res2 << '\n'; - - auto time2 = imgstr::getDuration(start); - std::cout << "Time LSTM: " << time2 << '\n'; -} +// auto time2 = imgstr::getDuration(start); +// std::cout << "Time LSTM: " << time2 << '\n'; +// } #ifdef _USE_OPENCV -std::string -extractTextFromImageFileOpenCV(const std::string &file_path, - const std::string &lang = "eng") { - +std::string extractTextFromImageFileOpenCV(const std::string &file_path, const std::string &lang = "eng") { cv::Mat img = cv::imread(file_path); if (img.empty()) { throw std::runtime_error("Failed to load image: " + file_path); @@ -214,14 +223,13 @@ extractTextFromImageFileOpenCV(const std::string &file_path, #ifdef _USE_OPENCV TEST_F(MyTestSuite, LEPTONICA_VS_OPENCV) { - auto start = getStartTime(); - auto res1 = extractTextFromImageFileLeptonica(inputOpenTest); + auto res1 = extractTextFromImageFileLeptonica(inputOpenTest); auto t1 = getDuration(start); std::cout << "Time Leptonica : " << t1 << '\n'; - auto s2 = getStartTime(); + auto s2 = getStartTime(); auto res2 = extractTextFromImageFileOpenCV(inputOpenTest); auto t2 = getDuration(start); @@ -230,8 +238,7 @@ TEST_F(MyTestSuite, LEPTONICA_VS_OPENCV) { #endif -auto -main(int argc, char **argv) -> int { +auto main(int argc, char **argv) -> int { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/textract.h b/textract.h index 1d97593..02f7f78 100644 --- a/textract.h +++ b/textract.h @@ -76,8 +76,10 @@ namespace imgstr { static constexpr auto WARNING = "\x1b[93m"; static constexpr auto WARNING_BOLD = "\x1b[1;33m"; static constexpr auto END = "\x1b[0m"; - static constexpr auto DELIMITER_STAR = "\x1b[90m******************************************************\x1b["; - static constexpr auto DELIMITER_DIM = "\x1b[90m******************************************************\x1b["; + static constexpr auto DELIMITER_STAR = "\x1b[90m******************************************************" + "\x1b["; + static constexpr auto DELIMITER_DIM = "\x1b[90m******************************************************" + "\x1b["; static constexpr auto DELIMITER_ITEM = "--------------------------------------------------------------"; } // namespace Ansi @@ -104,7 +106,9 @@ namespace imgstr { auto createDirIfNotExists(const std::string &output_dir, char path_separator = SEPARATOR) -> bool; - auto createQualifiedFilePath(const std::string &fileName, const std::string &directory, char path_separator = SEPARATOR) -> std::string; + auto createQualifiedFilePath(const std::string &fileName, + const std::string &directory, + char path_separator = SEPARATOR) -> std::string; auto getFilePaths(const llvm::Twine &directoryPath) -> std::vector; @@ -139,7 +143,9 @@ namespace imgstr { enum class ImgMode { document, image }; - auto getTextOCR(const std::vector &file_content, const std::string &lang, ImgMode img_mode) -> std::string; + auto getTextOCR(const std::vector &file_content, + const std::string &lang, + ImgMode img_mode) -> std::string; auto getTextImgFile(const std::string &file_path, const std::string &lang = "eng") -> std::string; @@ -149,7 +155,9 @@ namespace imgstr { void createPDF(const std::string &input_path, const std::string &output_path, const char *datapath = DATAPATH); - auto getTextOCRNoClear(const std::vector &file_content, const std::string &lang = "eng", ImgMode img_mode = ImgMode::document) -> std::string; + auto getTextOCRNoClear(const std::vector &file_content, + const std::string &lang = "eng", + ImgMode img_mode = ImgMode::document) -> std::string; #ifdef _USE_OPENCV @@ -288,7 +296,8 @@ namespace imgstr { if (ocrPtr->Init(nullptr, lang.c_str()) != 0) { delete ocrPtr; ocrPtr = nullptr; - throw std::runtime_error("Could not initialize tesseract."); + throw std::runtime_error("Could not initialize " + "tesseract."); } if (mode == ImgMode::image) { /* Optimized for Complex Images */ sout << "Image mode set\n"; @@ -338,8 +347,8 @@ namespace imgstr { Cache retrieval logic is determined by the SHA256 hash of the Image bytes - The SHA256 Byte Hash enables duplicate images to not be processed even if the - file names or paths differ. + The SHA256 Byte Hash enables duplicate images to not be processed even if + the file names or paths differ. */ @@ -374,7 +383,9 @@ namespace imgstr { time_processed(getCurrentTimestamp()) { } - void updateWriteInfo(const std::string &output_path, const std::string &write_timestamp, bool output_written) const { + void updateWriteInfo(const std::string &output_path, + const std::string &write_timestamp, + bool output_written) const { if (!mutex) { mutex = std::make_unique(); } @@ -458,7 +469,8 @@ namespace imgstr { } } - auto getImageOrProcess(const std::string &file_path, ISOLang lang = ISOLang::en) -> std::optional> { + auto getImageOrProcess(const std::string &file_path, + ISOLang lang = ISOLang::en) -> std::optional> { return processImageFile(file_path); } @@ -504,7 +516,8 @@ namespace imgstr { double newTime = 0.0; do { newTime = current + timeToAdd; - } while (!totalTime.compare_exchange_weak(current, newTime, std::memory_order_relaxed, std::memory_order_relaxed)); + } while (!totalTime.compare_exchange_weak( + current, newTime, std::memory_order_relaxed, std::memory_order_relaxed)); } auto getAverageProcessingTime() -> double { @@ -517,7 +530,8 @@ namespace imgstr { #endif logger->log() << "Processor Initialized" << '\n' - << "Threads Available: " << BOLD_WHITE << omp_get_max_threads() << END << "\nCores Available: " << BOLD_WHITE << omp_get_num_procs() << END << '\n'; + << "Threads Available: " << BOLD_WHITE << omp_get_max_threads() << END + << "\nCores Available: " << BOLD_WHITE << omp_get_num_procs() << END << '\n'; } void printCacheHit(const std::string &file) { @@ -529,7 +543,9 @@ namespace imgstr { } void printInputFileAlreadyProcessed(const std::string &file) { - logger->log() << DELIMITER_STAR << '\n' << WARNING << "File at path : " << END << file << "has already been processed to text" << '\n'; + logger->log() << DELIMITER_STAR << '\n' + << WARNING << "File at path : " << END << file << "has already been processed to text" + << '\n'; } void fileOpenErrorLog(const std::string &output_path) { @@ -537,7 +553,9 @@ namespace imgstr { } void overWriteLog(const std::string &output_path) { - logger->log() << WARNING_BOLD << "WARNING: " << END << WARNING << "File already exists - " << END << BOLD_WHITE << output_path << END << " Are you sure you want to overwrite the file?" << '\n'; + logger->log() << WARNING_BOLD << "WARNING: " << END << WARNING << "File already exists - " << END + << BOLD_WHITE << output_path << END << " Are you sure you want to overwrite the file?" + << '\n'; } void filesAlreadyProcessedLog() { @@ -546,8 +564,8 @@ namespace imgstr { void printOutputAlreadyWritten(const Image &image) { logger->log() << DELIMITER_STAR << '\n' - << WARNING << image.getName() << " Already Processed and written to " << END << image.write_info.output_path << " at " - << image.write_info.write_timestamp << '\n'; + << WARNING << image.getName() << " Already Processed and written to " << END + << image.write_info.output_path << " at " << image.write_info.write_timestamp << '\n'; } void printProcessingFile(const std::string &file) { @@ -556,7 +574,8 @@ namespace imgstr { void printProcessingDuration(double duration_ms) { logger->log() << DELIMITER_STAR << '\n' - << BOLD_WHITE << queued.size() << END << " Files Processed and Converted in " << BRIGHT_WHITE << duration_ms << " seconds\n" + << BOLD_WHITE << queued.size() << END << " Files Processed and Converted in " << BRIGHT_WHITE + << duration_ms << " seconds\n" << END << DELIMITER_STAR << "\n"; } @@ -582,18 +601,35 @@ namespace imgstr { "{3}Output Written: {1}{9}\n" "{3}Write Timestamp: {1}{10}\n" "{11}\n", - Ansi::GREEN_BOLD, Ansi::END, img.image_sha256, Ansi::BLUE, img.path, img.image_size, img.text_size, img.time_processed, - img.write_info.output_path, (img.write_info.output_written ? "Yes" : "No"), img.write_info.write_timestamp, Ansi::DELIMITER_ITEM) + Ansi::GREEN_BOLD, + Ansi::END, + img.image_sha256, + Ansi::BLUE, + img.path, + img.image_size, + img.text_size, + img.time_processed, + img.write_info.output_path, + (img.write_info.output_written ? "Yes" : "No"), + img.write_info.write_timestamp, + Ansi::DELIMITER_ITEM) .str(); - logstream << GREEN_BOLD << std::left << std::setw(width) << "SHA256: " << END << img.image_sha256 << '\n' + logstream << GREEN_BOLD << std::left << std::setw(width) << "SHA256: " << END << img.image_sha256 + << '\n' << BLUE << std::left << std::setw(width) << "Path: " << END << img.path << '\n' - << BLUE << std::left << std::setw(width) << "Image Size: " << END << img.image_size << " bytes\n" - << BLUE << std::left << std::setw(width) << "Text Size: " << END << img.text_size << " bytes\n" - << BLUE << std::left << std::setw(width) << "Processed Time: " << END << img.time_processed << '\n' - << BLUE << std::left << std::setw(width) << "Output Path: " << END << img.write_info.output_path << '\n' - << BLUE << std::left << std::setw(width) << "Output Written: " << END << (img.write_info.output_written ? "Yes" : "No") << '\n' - << BLUE << std::left << std::setw(width) << "Write Timestamp: " << END << img.write_info.write_timestamp << '\n' + << BLUE << std::left << std::setw(width) << "Image Size: " << END << img.image_size + << " bytes\n" + << BLUE << std::left << std::setw(width) << "Text Size: " << END << img.text_size + << " bytes\n" + << BLUE << std::left << std::setw(width) << "Processed Time: " << END + << img.time_processed << '\n' + << BLUE << std::left << std::setw(width) << "Output Path: " << END + << img.write_info.output_path << '\n' + << BLUE << std::left << std::setw(width) << "Output Written: " << END + << (img.write_info.output_written ? "Yes" : "No") << '\n' + << BLUE << std::left << std::setw(width) << "Write Timestamp: " << END + << img.write_info.write_timestamp << '\n' << DELIMITER_ITEM << '\n'; } @@ -601,8 +637,10 @@ namespace imgstr { } void destructionLog() { - logger->log() << LIGHT_GREY << "Destructor called - freeing " << BRIGHT_WHITE << imgstr::TesseractThreadCount.load(std::memory_order_relaxed) << END << " Tesseracts\n" - << END << "\nAverage Image Processing Latency: " << BOLD_WHITE << getAverageProcessingTime() << END << " ms\n" + logger->log() << LIGHT_GREY << "Destructor called - freeing " << BRIGHT_WHITE + << imgstr::TesseractThreadCount.load(std::memory_order_relaxed) << END << " Tesseracts\n" + << END << "\nAverage Image Processing Latency: " << BOLD_WHITE << getAverageProcessingTime() + << END << " ms\n" << '\n' << @@ -646,7 +684,9 @@ namespace imgstr { return std::nullopt; } - void processImagesDir(const std::string &directory, bool write_output = false, const std::string &output_path = "") { + void processImagesDir(const std::string &directory, + bool write_output = false, + const std::string &output_path = "") { auto files = getFilePaths(directory); for (const auto &filePath: files) { @@ -692,11 +732,24 @@ namespace imgstr { outFile << content; } - void convertImageToTextFile(const std::string &input_file, const std::string &output_dir = "", ISOLang lang = ISOLang::en) { + /** + * @brief Converts an image file to a text file. + * + * @param input_file The input image file. + * @param output_dir The output directory (optional). + * @param lang The language of the text (optional, default: en). + * @return void + * @usage convertImageToTextFile("image.jpg", "output_dir", ISOLang::en); + */ + void convertImageToTextFile(const std::string &input_file, + const std::string &output_dir = "", + ISOLang lang = ISOLang::en) { createDirIfNotExists(output_dir); std::string output_file = createQualifiedFilePath(input_file, output_dir); + logger->log() << "Output file : " << output_file << '\n'; + auto imageOpt = getImageOrProcess(input_file, lang); if (imageOpt) { @@ -745,7 +798,9 @@ namespace imgstr { queued.clear(); } - void generatePDF(const std::string &input_path, const std::string &output_path, const char *datapath = DATAPATH) { + void generatePDF(const std::string &input_path, + const std::string &output_path, + const char *datapath = DATAPATH) { try { createPDF(input_path, output_path); @@ -904,10 +959,13 @@ namespace imgstr { } inline void tesseractInvokeLog(ImgMode img_mode) { - serr << ERROR << "getTextOCR " << (img_mode == ImgMode::document ? "document mode " : "image mode ") << END << " -> called from thread: " << omp_get_thread_num() << '\n'; + serr << ERROR << "getTextOCR " << (img_mode == ImgMode::document ? "document mode " : "image mode ") << END + << " -> called from thread: " << omp_get_thread_num() << '\n'; } - inline auto getTextOCR(const std::vector &file_content, const std::string &lang, ImgMode img_mode = ImgMode::document) -> std::string { + inline auto getTextOCR(const std::vector &file_content, + const std::string &lang, + ImgMode img_mode = ImgMode::document) -> std::string { /* Leptonica reads 40% or more faster than OpenCV */ #ifdef _DEBUGAPP @@ -919,7 +977,8 @@ namespace imgstr { } Pix *image = pixReadMem(static_cast(file_content.data()), file_content.size()); if (image == nullptr) { - throw std::runtime_error("Failed to load image from memory buffer"); + throw std::runtime_error("Failed to load image from memory " + "buffer"); } thread_local_tesserat->SetImage(image); @@ -956,13 +1015,16 @@ namespace imgstr { delete renderer; } - inline auto getTextOCRNoClear(const std::vector &file_content, const std::string &lang, ImgMode img_mode) -> std::string { + inline auto getTextOCRNoClear(const std::vector &file_content, + const std::string &lang, + ImgMode img_mode) -> std::string { if (thread_local_tesserat.ocrPtr == nullptr) { thread_local_tesserat.init(lang, img_mode); } Pix *image = pixReadMem(static_cast(file_content.data()), file_content.size()); if (image == nullptr) { - throw std::runtime_error("Failed to load image from memory buffer"); + throw std::runtime_error("Failed to load image from memory " + "buffer"); } thread_local_tesserat->SetImage(image); @@ -1023,7 +1085,8 @@ namespace imgstr { return outText; }; - inline std::string extractTextFromImageBytes(const std::vector &file_content, const std::string &lang = "eng") { + inline std::string + extractTextFromImageBytes(const std::vector &file_content, const std::string &lang = "eng") { cv::Mat img = cv::imdecode(file_content, cv::IMREAD_COLOR); if (img.empty()) { throw std::runtime_error("Failed to load image from buffer"); @@ -1076,8 +1139,17 @@ namespace imgstr { #pragma endregion +// #pragma region FILE_IO_IMPL /* STL File I/O Implementations */ + /** + * @brief Create Dir If Not Exists + * + * @param output_dir + * @param path_separator + * @return true + * @usage createDirIfNotExists("path/to/file", '/) + */ inline auto createDirIfNotExists(const std::string &output_dir, const char path_separator) -> bool { if (!output_dir.empty() && !fileExists(output_dir)) { return createDirectories(output_dir); @@ -1085,7 +1157,17 @@ namespace imgstr { return true; } - inline auto createQualifiedFilePath(const std::string &fileName, const std::string &directory, const char path_separator) -> std::string { + /** + * @brief Create a Qualified File Path object + * + * @param fileName + * @param directory + * @param path_separator + * @return std::string + */ + inline auto createQualifiedFilePath(const std::string &fileName, + const std::string &directory, + const char path_separator) -> std::string { if (!directory.empty() && !fileExists(directory) && !createDirectories(directory)) { throw std::runtime_error("Failed to Create Directory at: " + directory); }