Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Rabin-Karp string search algorithm and tests #48

Merged
merged 2 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions src/algorithms/string/rabin_karp.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#ifndef RABIN_KARP_H
#define RABIN_KARP_H

#ifdef __cplusplus
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
#endif

// Implementation details of rabin_karp(). A named namespace with inline
// functions is used (instead of an anonymous namespace) so that this header can
// be included from several translation units without duplicate definitions.
namespace rabin_karp_detail {
/// Radix of the polynomial rolling hash.
constexpr std::uint64_t base = 26;
/// Prime modulus applied to every hash value (10^9 + 7).
constexpr std::uint64_t modulus = 1000000007;

/**
 * @brief Converts a character to its byte value in the range [0, 255].
 *
 * Going through unsigned char matters: where plain char is signed, bytes
 * >= 0x80 would otherwise sign-extend to huge values and make the rolling
 * hash update disagree with compute_hash().
 *
 * @param c The character to convert.
 * @return The value of c interpreted as an unsigned byte.
 */
inline std::uint64_t byte_value(char c) {
    return static_cast<unsigned char>(c);
}

/**
 * @brief Computes the polynomial hash of the substring str[start, end).
 *
 * The character at position k contributes byte(str[k]) * base^(end - 1 - k),
 * everything reduced modulo `modulus` (evaluated with Horner's scheme).
 *
 * @param str The input string.
 * @param start Index of the first character of the substring (inclusive).
 * @param end Index one past the last character of the substring (exclusive).
 * @return The hash of the substring; 0 when the range is empty.
 * @note The caller must ensure start <= end <= str.length().
 */
inline size_t compute_hash(const std::string &str, size_t start, size_t end) {
    std::uint64_t hash_value = 0;
    for (size_t i = start; i < end; ++i) {
        hash_value = (hash_value * base + byte_value(str[i])) % modulus;
    }
    return static_cast<size_t>(hash_value);
}

/**
 * @brief Verifies a hash hit by comparing two substrings character by character.
 *
 * Equal hashes do not guarantee equal substrings, so every hash match is
 * confirmed with this check to rule out a hash collision.
 *
 * @param str1 The first string.
 * @param start1 The starting position of the first substring in str1.
 * @param str2 The second string.
 * @param start2 The starting position of the second substring in str2.
 * @param length The length of the substrings to compare.
 * @return true if the two substrings are identical (a genuine match),
 *         false if they differ in at least one position.
 * @note Assumes start1 + length <= str1.length() and
 *       start2 + length <= str2.length().
 */
inline bool check_collision(const std::string &str1, size_t start1, const std::string &str2, size_t start2, size_t length) {
    return str1.compare(start1, length, str2, start2, length) == 0;
}
} // namespace rabin_karp_detail

/**
 * @brief Executes the Rabin-Karp algorithm to search for occurrences of a pattern within a text.
 *
 * @details
 * The hash of the pattern is compared against a rolling hash of every window of
 * the text that has the pattern's length. A window is reported only after a
 * character-wise comparison confirms the hash hit. Characters are hashed by
 * their unsigned byte value, so non-ASCII input is handled correctly.
 *
 * @param text The input text to search within.
 * @param pattern The pattern to search for within the text.
 * @return A vector of starting indices (ascending) of all, possibly overlapping,
 *         occurrences of the pattern in the text. An empty pattern matches at
 *         every index from 0 to text.length() inclusive. If none were found
 *         the vector is empty.
 */
inline std::vector<size_t> rabin_karp(const std::string &text, const std::string &pattern) {
    std::vector<size_t> result;
    const size_t pattern_length = pattern.length();
    const size_t text_length = text.length();

    // An empty pattern is found at every index, including the end of the text.
    if (pattern_length == 0) {
        result.reserve(text_length + 1);
        for (size_t i = 0; i <= text_length; ++i) {
            result.push_back(i);
        }
        return result;
    }

    // A pattern longer than the text can never occur in it.
    if (text_length < pattern_length) {
        return result;
    }

    const std::uint64_t pattern_hash = rabin_karp_detail::compute_hash(pattern, 0, pattern_length);
    std::uint64_t window_hash = rabin_karp_detail::compute_hash(text, 0, pattern_length);

    // base^(pattern_length - 1): the weight of the leading character of a window.
    std::uint64_t power = 1;
    for (size_t i = 1; i < pattern_length; ++i) {
        power = (power * rabin_karp_detail::base) % rabin_karp_detail::modulus;
    }

    const size_t last_start = text_length - pattern_length;
    for (size_t i = 0; i <= last_start; ++i) {
        if (pattern_hash == window_hash &&
            rabin_karp_detail::check_collision(text, i, pattern, 0, pattern_length)) {
            result.push_back(i);
        }

        if (i == last_start) {
            break; // no further window to roll into
        }

        // Slide the window: drop text[i], shift the rest up by one power and
        // append text[i + pattern_length]. Adding the modulus before the
        // subtraction keeps the intermediate value non-negative.
        const std::uint64_t outgoing =
            (rabin_karp_detail::byte_value(text[i]) * power) % rabin_karp_detail::modulus;
        window_hash = ((window_hash + rabin_karp_detail::modulus - outgoing) * rabin_karp_detail::base +
                       rabin_karp_detail::byte_value(text[i + pattern_length])) %
                      rabin_karp_detail::modulus;
    }

    return result;
}

#endif
99 changes: 99 additions & 0 deletions tests/algorithms/string/rabin_karp.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#include "../../../src/algorithms/string/rabin_karp.h"
#include "../../../third_party/catch.hpp"
#include <string>
#include <vector>

TEST_CASE("Testing Rabin Karp with no valid matches") {
    // The needle does not occur anywhere in the haystack, so nothing may be reported.
    const std::string haystack{"Hello world"};
    const std::string needle{"abc"};

    const std::vector<size_t> result = rabin_karp(haystack, needle);

    REQUIRE(result.empty());
}

TEST_CASE("Testing Rabin Karp with valid input") {
    // The needle is a prefix of the haystack: exactly one hit, at index 0.
    const std::string haystack{"Hello world"};
    const std::string needle{"Hello"};

    const std::vector<size_t> result = rabin_karp(haystack, needle);

    REQUIRE(result.size() == 1);
    REQUIRE(result[0] == 0);
}

TEST_CASE("Testing Rabin Karp with incorrect input") {
    // A needle longer than the haystack can never match.
    const std::string haystack{"abc"};
    const std::string needle{"abcdefg"};

    const std::vector<size_t> result = rabin_karp(haystack, needle);

    REQUIRE(result.empty());
}

TEST_CASE("Testing Rabin Karp with multiple occurrences") {
    // Three back-to-back, non-overlapping copies of the needle.
    const std::string haystack{"abcabcabc"};
    const std::string needle{"abc"};

    const std::vector<size_t> result = rabin_karp(haystack, needle);

    REQUIRE(result.size() == 3);
    REQUIRE(result[0] == 0);
    REQUIRE(result[1] == 3);
    REQUIRE(result[2] == 6);
}

TEST_CASE("Testing Rabin Karp with empty text and non-empty pattern") {
    // Nothing can be found inside an empty haystack.
    const std::string haystack;
    const std::string needle{"abcdefg"};

    const std::vector<size_t> result = rabin_karp(haystack, needle);

    REQUIRE(result.empty());
}

TEST_CASE("Testing Rabin Karp with non-empty text and empty pattern") {
    std::string text = "abcdefg";
    std::string pattern = "";

    // An empty pattern matches at every index, including one past the last character.
    std::vector<size_t> result = rabin_karp(text, pattern);
    REQUIRE(result.size() == text.length() + 1);

    // Checking only the count would accept wrong indices; verify each one.
    for (size_t i = 0; i < result.size(); i++) {
        REQUIRE(result[i] == i);
    }
}

TEST_CASE("Testing Rabin Karp with both empty text and pattern") {
    // The empty needle matches the empty haystack exactly once, at index 0.
    const std::string haystack;
    const std::string needle;

    const std::vector<size_t> result = rabin_karp(haystack, needle);

    REQUIRE(result.size() == 1);
    REQUIRE(result[0] == 0);
}

TEST_CASE("Testing Rabin Karp with pattern equals to the text") {
    // Searching a string for itself yields the single match at index 0.
    const std::string haystack{"Hello world"};
    const std::string needle{haystack};

    const std::vector<size_t> result = rabin_karp(haystack, needle);

    REQUIRE(result.size() == 1);
    REQUIRE(result[0] == 0);
}

TEST_CASE("Stress testing Rabin Karp") {
    // Integer sizes instead of floating-point literals (1e6, 1e3): avoids the
    // implicit double -> size_t conversions and the size_t == double comparison.
    const size_t text_length = 1000000;
    const size_t pattern_length = 1000;

    std::string text(text_length, 'a');
    std::string pattern(pattern_length, 'a');

    // Every window of a text made of a single repeated character matches.
    std::vector<size_t> result = rabin_karp(text, pattern);
    REQUIRE(result.size() == text_length - pattern_length + 1);
    for (size_t i = 0; i < result.size(); i++) {
        REQUIRE(result[i] == i);
    }
}

TEST_CASE("Testing Rabin Karp with multiple occurrences of duplicate characters") {
    // Runs of 'a' produce overlapping matches; each one must be reported.
    const std::string haystack{"aabcaaabcaaaab"};
    const std::string needle{"aa"};

    const std::vector<size_t> result = rabin_karp(haystack, needle);

    REQUIRE(result.size() == 6);
    REQUIRE(result[0] == 0);
    REQUIRE(result[1] == 4);
    REQUIRE(result[2] == 5);
    REQUIRE(result[3] == 9);
    REQUIRE(result[4] == 10);
    REQUIRE(result[5] == 11);
}
Loading