#ifndef RABIN_KARP_H
#define RABIN_KARP_H

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

/**
 * @brief Implementation details of the Rabin-Karp search. Not part of the public API.
 *
 * A named namespace with inline functions is used instead of an anonymous
 * namespace so that including this header from several translation units
 * neither duplicates the helpers nor leaks generic names such as `base`
 * into the global scope.
 */
namespace rabin_karp_detail {
    /// Radix of the polynomial hash; 256 covers every possible byte value.
    constexpr std::uint64_t kBase = 256;
    /// Prime modulus of the polynomial hash. Residues stay below 2^30, so the
    /// product of two residues always fits in 64 bits.
    constexpr std::uint64_t kModulus = 1000000007ULL;

    /**
     * @brief Converts a character to its byte value in the range [0, 255].
     *
     * Going through unsigned char matters: where `char` is signed, a direct
     * conversion of a byte >= 0x80 to an unsigned 64-bit type sign-extends to a
     * value close to 2^64, whose products wrap around and corrupt the hash.
     */
    inline std::uint64_t byte_value(char c) {
        return static_cast<std::uint64_t>(static_cast<unsigned char>(c));
    }

    /**
     * @brief Computes the polynomial hash of the substring str[start, end).
     * @param str The input string.
     * @param start Index of the first character of the substring.
     * @param end Index one past the last character of the substring.
     * @return sum(str[start + k] * kBase^(end - start - 1 - k)) modulo kModulus;
     *         0 for an empty range.
     * @note Assumes start <= end <= str.length().
     */
    inline std::uint64_t compute_hash(const std::string &str, std::size_t start, std::size_t end) {
        std::uint64_t hash_value = 0;
        // Horner's scheme: the first character ends up with the highest power.
        for (std::size_t i = start; i < end; ++i) {
            hash_value = (hash_value * kBase + byte_value(str[i])) % kModulus;
        }
        return hash_value;
    }

    /**
     * @brief Verifies a hash hit by comparing two substrings character by character.
     *
     * Equal hashes do not guarantee equal substrings (hash collision), so every
     * hash hit is confirmed with this function.
     *
     * @param str1 The first string.
     * @param start1 The starting position of the first substring in str1.
     * @param str2 The second string.
     * @param start2 The starting position of the second substring in str2.
     * @param length The length of the substrings to compare.
     * @return true if the two substrings are identical (the hit is genuine),
     *         false if they differ (the hit was a hash collision).
     * @note Assumes start1 + length <= str1.length() and start2 + length <= str2.length().
     */
    inline bool check_collision(const std::string &str1, std::size_t start1,
                                const std::string &str2, std::size_t start2, std::size_t length) {
        return str1.compare(start1, length, str2, start2, length) == 0;
    }
} // namespace rabin_karp_detail

/**
 * @brief Executes the Rabin-Karp algorithm to search for occurrences of a pattern within a text.
 *
 * @details
 * The hash of the pattern is compared against a rolling hash of every window of
 * the text that has the pattern's length. Each hash hit is verified with a
 * direct comparison, so the result never contains false positives. Strings are
 * treated as raw byte sequences.
 *
 * @param text The input text to search within.
 * @param pattern The pattern to search for within the text.
 * @return A vector of starting indices of all (possibly overlapping) occurrences
 *         of the pattern in the text, in increasing order. If none were found
 *         the vector is empty. An empty pattern matches at every index
 *         including text.length().
 */
inline std::vector<std::size_t> rabin_karp(const std::string &text, const std::string &pattern) {
    namespace detail = rabin_karp_detail;

    std::vector<std::size_t> result;
    const std::size_t pattern_length = pattern.length();
    const std::size_t text_length = text.length();

    if (pattern_length == 0) { // an empty pattern is found at every index including the end of the text
        result.reserve(text_length + 1);
        for (std::size_t i = 0; i <= text_length; ++i) {
            result.push_back(i);
        }
        return result;
    }

    if (text_length < pattern_length) { // a text shorter than the pattern cannot contain it
        return result;
    }

    // hash of the pattern and of the first window of the text
    const std::uint64_t pattern_hash = detail::compute_hash(pattern, 0, pattern_length);
    std::uint64_t window_hash = detail::compute_hash(text, 0, pattern_length);

    // kBase^(pattern_length - 1): the weight of the leading character of a window
    std::uint64_t high_power = 1;
    for (std::size_t i = 1; i < pattern_length; ++i) {
        high_power = (high_power * detail::kBase) % detail::kModulus;
    }

    const std::size_t last_start = text_length - pattern_length;
    for (std::size_t i = 0; ; ++i) {
        if (window_hash == pattern_hash && detail::check_collision(text, i, pattern, 0, pattern_length)) {
            result.push_back(i);
        }
        if (i == last_start) {
            break;
        }

        // Slide the window: drop text[i], shift by one digit, append text[i + pattern_length].
        // kModulus is added before subtracting so the intermediate value never goes negative.
        const std::uint64_t leading = (detail::byte_value(text[i]) * high_power) % detail::kModulus;
        const std::uint64_t without_leading = (window_hash + detail::kModulus - leading) % detail::kModulus;
        window_hash = (without_leading * detail::kBase + detail::byte_value(text[i + pattern_length])) % detail::kModulus;
    }

    return result;
}

// ---------------------------------------------------------------------------
// Catch test suite (originally tests/algorithms/string/rabin_karp.cc).
// Compiled only when Catch has been included before this point, so regular
// users of the algorithm do not need the test framework.
// ---------------------------------------------------------------------------
#ifdef TEST_CASE

TEST_CASE("Testing Rabin Karp with no valid matches") {
    std::string text = "Hello world";
    std::string pattern = "abc";

    std::vector<std::size_t> result = rabin_karp(text, pattern);
    REQUIRE(result.empty());
}

TEST_CASE("Testing Rabin Karp with valid input") {
    std::string text = "Hello world";
    std::string pattern = "Hello";

    std::vector<std::size_t> result = rabin_karp(text, pattern);
    REQUIRE(result.size() == 1);
    REQUIRE(result[0] == 0);
}

TEST_CASE("Testing Rabin Karp with incorrect input") {
    std::string text = "abc";
    std::string pattern = "abcdefg";

    std::vector<std::size_t> result = rabin_karp(text, pattern);
    REQUIRE(result.empty());
}

TEST_CASE("Testing Rabin Karp with multiple occurrences") {
    std::string text = "abcabcabc";
    std::string pattern = "abc";

    std::vector<std::size_t> result = rabin_karp(text, pattern);
    REQUIRE(result.size() == 3);
    REQUIRE(result[0] == 0);
    REQUIRE(result[1] == 3);
    REQUIRE(result[2] == 6);
}

TEST_CASE("Testing Rabin Karp with empty text and non-empty pattern") {
    std::string text = "";
    std::string pattern = "abcdefg";

    std::vector<std::size_t> result = rabin_karp(text, pattern);
    REQUIRE(result.empty());
}

TEST_CASE("Testing Rabin Karp with non-empty text and empty pattern") {
    std::string text = "abcdefg";
    std::string pattern = "";

    std::vector<std::size_t> result = rabin_karp(text, pattern);
    REQUIRE(result.size() == text.length() + 1);
}

TEST_CASE("Testing Rabin Karp with both empty text and pattern") {
    std::string text = "";
    std::string pattern = "";

    std::vector<std::size_t> result = rabin_karp(text, pattern);
    REQUIRE(result.size() == 1);
    REQUIRE(result[0] == 0);
}

TEST_CASE("Testing Rabin Karp with pattern equals to the text") {
    std::string text = "Hello world";
    std::string pattern = "Hello world";

    std::vector<std::size_t> result = rabin_karp(text, pattern);
    REQUIRE(result.size() == 1);
    REQUIRE(result[0] == 0);
}

TEST_CASE("Stress testing Rabin Karp") {
    std::string text(1000000, 'a');
    std::string pattern(1000, 'a');

    std::vector<std::size_t> result = rabin_karp(text, pattern);
    REQUIRE(result.size() == 1000000 - 1000 + 1);
    for (std::size_t i = 0; i < result.size(); i++) {
        REQUIRE(result[i] == i);
    }
}

TEST_CASE("Testing Rabin Karp with multiple occurrences of duplicate characters") {
    std::string text = "aabcaaabcaaaab";
    std::string pattern = "aa";

    std::vector<std::size_t> result = rabin_karp(text, pattern);
    REQUIRE(result.size() == 6);
    REQUIRE(result[0] == 0);
    REQUIRE(result[1] == 4);
    REQUIRE(result[2] == 5);
    REQUIRE(result[3] == 9);
    REQUIRE(result[4] == 10);
    REQUIRE(result[5] == 11);
}

TEST_CASE("Testing Rabin Karp with bytes outside the ASCII range") {
    // Bytes >= 0x80 are negative where char is signed; the rolling hash must
    // still agree with the directly computed pattern hash.
    const std::string accent = "\xC3\xA9";
    std::string text = "a" + accent + "b" + accent;
    std::string pattern = accent;

    std::vector<std::size_t> result = rabin_karp(text, pattern);
    REQUIRE(result.size() == 2);
    REQUIRE(result[0] == 1);
    REQUIRE(result[1] == 4);
}

#endif // TEST_CASE

#endif // RABIN_KARP_H