Skip to content

Commit

Permalink
fuzzy match implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
nticaric committed Jul 7, 2017
1 parent 1687c93 commit fdd9189
Show file tree
Hide file tree
Showing 4 changed files with 1,775 additions and 12 deletions.
23 changes: 11 additions & 12 deletions helper/helpers.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,17 @@ function stringEndsWith($haystack, $needle)
}

if (!function_exists('fuzzyMatch')) {
function fuzzyMatch($pattern, $str)
function fuzzyMatch($pattern, $items)
{
$j = 0;
$patternLength = strlen($pattern);
$strLength = strlen($str);

for ($i = 0; $i < $strLength && $j < $patternLength; $i++) {
if ($pattern[$j] == $str[$i]) {
$j++;
}
}
$fm = new TeamTNT\TNTSearch\TNTFuzzyMatch;
return $fm->fuzzyMatch($pattern, $items);
}
}

return ($j == $patternLength);
if (!function_exists('fuzzyMatchFromFile')) {
    /**
     * Global shortcut for TNTFuzzyMatch::fuzzyMatchFromFile().
     *
     * Creates a fresh TNTFuzzyMatch instance and forwards both arguments
     * to it unchanged; whatever that method returns is returned as-is.
     *
     * @param string $pattern Pattern to look for.
     * @param string $path    Path handed to the matcher's file reader.
     *
     * @return mixed Result of TNTFuzzyMatch::fuzzyMatchFromFile().
     */
    function fuzzyMatchFromFile($pattern, $path)
    {
        return (new TeamTNT\TNTSearch\TNTFuzzyMatch)->fuzzyMatchFromFile($pattern, $path);
    }
}
}
144 changes: 144 additions & 0 deletions src/TNTFuzzyMatch.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
<?php

namespace TeamTNT\TNTSearch;

/**
 * Subsequence-based fuzzy matcher.
 *
 * Candidates are kept when the pattern occurs in them as a subsequence
 * (compared byte by byte) and are then scored by the cosine of the angle
 * between the candidate's vector and a masked copy of it that keeps only
 * the positions matched by the pattern. Results are returned best first.
 */
class TNTFuzzyMatch
{
    /**
     * Euclidean length of a numeric vector.
     *
     * @param array $vec Numeric components.
     *
     * @return float Square root of the sum of squared components (0.0 for []).
     */
    public function norm($vec)
    {
        $sumOfSquares = 0;

        foreach ($vec as $component) {
            $sumOfSquares += $component * $component;
        }

        return sqrt($sumOfSquares);
    }

    /**
     * Dot product of two numeric, zero-indexed vectors.
     *
     * When the vectors differ in length only the positions present in both
     * contribute; the surplus components are treated as multiplied by zero.
     * This gives the same number the previous implementation produced, but
     * without reading undefined offsets when $vec2 is the shorter one.
     *
     * @param array $vec1 First vector.
     * @param array $vec2 Second vector.
     *
     * @return int|float Sum of the pairwise products.
     */
    public function dot($vec1, $vec2)
    {
        $prod       = 0;
        $components = min(count($vec1), count($vec2));

        for ($i = 0; $i < $components; $i++) {
            $prod += ($vec1[$i] * $vec2[$i]);
        }

        return $prod;
    }

    /**
     * Converts a word into a vector with one integer component per byte.
     *
     * A byte found in the alphabet maps to its byte offset in the alphabet
     * string plus 1000000. Multi-byte alphabet letters therefore contribute
     * one component per byte, which stays consistent with the byte-wise
     * comparison done in hasCommonSubsequence().
     *
     * Because every component is close to 1000000, the cosine computed from
     * these vectors is dominated by how many positions are matched rather
     * than by which letters they hold.
     *
     * @param string $word Word to convert.
     *
     * @return int[] One component per byte; [] for an empty word.
     */
    public function wordToVector($word)
    {
        $alphabet = "aAbBcCčČćĆdDđĐeEfFgGhHiIjJkKlLmMnNoOpPqQrRsSšŠtTvVuUwWxXyYzZžŽ1234567890'+ /";

        $word = (string) $word;

        // str_split('') yields [''] before PHP 8.2 and [] from 8.2 on;
        // return early so the result does not depend on the PHP version.
        if ($word === '') {
            return [];
        }

        $alphabetLength = strlen($alphabet);

        $result = [];
        foreach (str_split($word) as $byte) {
            $position = strpos($alphabet, $byte);

            if ($position === false) {
                // strpos() returns false for bytes outside the alphabet.
                // Letting that coerce to 0 would make every unknown byte
                // indistinguishable from 'a', so place unknown bytes after
                // the alphabet instead, one distinct slot per byte value.
                $position = $alphabetLength + ord($byte);
            }

            $result[] = $position + 1000000;
        }

        return $result;
    }

    /**
     * Cosine of the angle between two vectors.
     *
     * Despite the method name, the value returned is the cosine
     * (dot product divided by the product of the norms), not the angle.
     *
     * @param array $a First vector.
     * @param array $b Second vector.
     *
     * @return int|float The cosine, or 0 when either vector has zero length.
     */
    public function angleBetweenVectors($a, $b)
    {
        $denominator = ($this->norm($a) * $this->norm($b));

        // Guard against division by zero for empty or all-zero vectors.
        if ($denominator == 0) {
            return 0;
        }

        return $this->dot($a, $b) / $denominator;
    }

    /**
     * Tells whether $pattern occurs in $str as a subsequence, i.e. all of
     * its bytes appear in $str in the same order, not necessarily adjacent.
     *
     * The comparison is byte-wise and case-sensitive. An empty pattern
     * matches every string.
     *
     * @param string $pattern Bytes to look for, in order.
     * @param string $str     String to scan.
     *
     * @return bool
     */
    public function hasCommonSubsequence($pattern, $str)
    {
        // Cast so that numeric items can be indexed like strings.
        $pattern = (string) $pattern;
        $str     = (string) $str;

        $j             = 0;
        $patternLength = strlen($pattern);
        $strLength     = strlen($str);

        for ($i = 0; $i < $strLength && $j < $patternLength; $i++) {
            if ($pattern[$j] === $str[$i]) {
                $j++;
            }
        }

        return ($j == $patternLength);
    }

    /**
     * Builds a masked copy of $str that keeps only the components matched,
     * greedily and in order, by the components of $pattern; every other
     * position is filled with 0.
     *
     * The result has max(count($pattern), count($str)) components.
     *
     * @param array $str     Vector of the candidate word.
     * @param array $pattern Vector of the search pattern.
     *
     * @return array Masked vector.
     */
    public function makeVectorSameLength($str, $pattern)
    {
        $length = max(count($pattern), count($str));
        $masked = [];
        $j      = 0;

        // $j advances at most once per iteration, so it can never overtake
        // $i and needs no loop bound of its own.
        for ($i = 0; $i < $length; $i++) {
            // Loose comparison kept so int and float components holding the
            // same value still count as a match.
            if (isset($pattern[$j], $str[$i]) && $pattern[$j] == $str[$i]) {
                $masked[] = $str[$i];
                $j++;
            } else {
                $masked[] = 0;
            }
        }

        return $masked;
    }

    /**
     * Fuzzy-matches $pattern against every line of a file.
     *
     * Lines are read whole (no length cap), so a long line is tested as a
     * single candidate instead of being split into fragments. If the file
     * cannot be opened the result is an empty array; fopen() itself still
     * reports the failure as a PHP warning.
     *
     * @param string $pattern Pattern to look for.
     * @param string $path    File with one candidate per line.
     *
     * @return array Trimmed line => score, sorted by score descending.
     */
    public function fuzzyMatchFromFile($pattern, $path)
    {
        $matches = [];
        $handle  = fopen($path, "r");

        if ($handle) {
            // Loop on the fgets() result rather than feof(): feof() only
            // turns true after a read has already failed, which would pass
            // one trailing `false` to the matcher.
            while (($line = fgets($handle)) !== false) {
                if ($this->hasCommonSubsequence($pattern, $line)) {
                    $matches[] = $line;
                }
            }
            fclose($handle);
        }

        return $this->rankByPattern($pattern, $matches);
    }

    /**
     * Fuzzy-matches $pattern against a list of strings.
     *
     * @param string   $pattern Pattern to look for.
     * @param iterable $items   Candidate strings.
     *
     * @return array Trimmed item => score, sorted by score descending.
     */
    public function fuzzyMatch($pattern, $items)
    {
        $matches = [];

        foreach ($items as $item) {
            if ($this->hasCommonSubsequence($pattern, $item)) {
                $matches[] = $item;
            }
        }

        return $this->rankByPattern($pattern, $matches);
    }

    /**
     * Scores already-filtered candidates against the pattern and sorts them.
     *
     * Candidates are trimmed before scoring and used as array keys, so
     * candidates that are equal after trimming collapse into one entry.
     *
     * @param string $pattern    Pattern the candidates were filtered with.
     * @param array  $candidates Strings containing the pattern as a subsequence.
     *
     * @return array Trimmed candidate => score, highest score first.
     */
    private function rankByPattern($pattern, $candidates)
    {
        $patternVector = $this->wordToVector($pattern);

        $ranked = [];
        foreach ($candidates as $candidate) {
            $word       = trim($candidate);
            $wordVector = $this->wordToVector($word);
            $mask       = $this->makeVectorSameLength($wordVector, $patternVector);

            $ranked[$word] = $this->angleBetweenVectors($wordVector, $mask);
        }

        arsort($ranked);

        return $ranked;
    }
}
110 changes: 110 additions & 0 deletions tests/TNTFuzzyMatchTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
<?php

use TeamTNT\TNTSearch\TNTFuzzyMatch;

/**
 * Tests for TNTFuzzyMatch and for the global fuzzyMatch() /
 * fuzzyMatchFromFile() helper functions.
 *
 * NOTE(review): the class is called TNTGeoSearchTest although it only
 * exercises TNTFuzzyMatch and the diff header places it in
 * tests/TNTFuzzyMatchTest.php. This looks like a copy-paste leftover and
 * would clash with any other class of the same name in the suite; confirm
 * and rename to TNTFuzzyMatchTest.
 */
class TNTGeoSearchTest extends PHPUnit_Framework_TestCase
{
    /**
     * Matcher under test, rebuilt before every test method.
     *
     * @var TNTFuzzyMatch
     */
    protected $fm;

    /**
     * Creates the fixture in setUp() instead of overriding the constructor,
     * so the parent TestCase constructor keeps running untouched.
     */
    protected function setUp()
    {
        $this->fm = new TNTFuzzyMatch;
    }

    public function testNorm()
    {
        // 3-4-5 right triangle.
        $this->assertEquals(5, $this->fm->norm([3, 4]));

        // sqrt(1 + 4 + 9 + 16 + 25) = sqrt(55)
        $this->assertEquals(7.416198487095663, $this->fm->norm([1, 2, 3, 4, 5]));
    }

    public function testDot()
    {
        $vector1 = [1, 2, -5];
        $vector2 = [4, 8, 1];

        // 1*4 + 2*8 + (-5)*1 = 15
        $this->assertEquals(15, $this->fm->dot($vector1, $vector2));
    }

    public function testWordToVector()
    {
        $vector = $this->fm->wordToVector("TNT");

        $this->assertEquals([1000055, 1000039, 1000055], $vector);
    }

    public function testAngleBetweenVectors()
    {
        $vector1 = [1, 2, 3];
        $vector2 = [4, 5, 6];

        $angle = $this->fm->angleBetweenVectors($vector1, $vector2);

        $this->assertEquals(0.97463184619707621, $angle);
    }

    public function testHasCommonSubsequence()
    {
        // "tnsarh" appears in order inside "tntsearch" ...
        $this->assertTrue($this->fm->hasCommonSubsequence("tnsarh", 'tntsearch'));

        // ... while "ntnsearch" does not.
        $this->assertFalse($this->fm->hasCommonSubsequence("ntnsearch", 'tntsearch'));
    }

    public function testMakeVectorSameLength()
    {
        $wordVector    = $this->fm->wordToVector("tntsearch");
        $patternVector = $this->fm->wordToVector("tnth");

        $res = $this->fm->makeVectorSameLength($wordVector, $patternVector);

        // Matched positions keep their value, all others become 0.
        $this->assertEquals([1000054, 1000038, 1000054, 0, 0, 0, 0, 0, 1000026], $res);
    }

    public function testFuzzyMatchFromFile()
    {
        $res = $this->fm->fuzzyMatchFromFile('search', __DIR__.'/_files/english_wordlist_2k.txt');

        $this->assertEquals([
            'search'   => 1,
            'research' => 0.86602345529065,
        ], $res);
    }

    public function testFuzzyMatchFromFileFunction()
    {
        $res = fuzzyMatchFromFile('search', __DIR__.'/_files/english_wordlist_2k.txt');

        $this->assertEquals([
            'search'   => 1,
            'research' => 0.86602345529065,
        ], $res);
    }

    public function testFuzzyMatch()
    {
        $res = $this->fm->fuzzyMatch('search', ['search', 'research', 'something']);

        $this->assertEquals([
            'search'   => 1,
            'research' => 0.86602345529065,
        ], $res);
    }

    public function testFuzzyMatchFunction()
    {
        $res = fuzzyMatch('search', ['search', 'research', 'something']);

        $this->assertEquals([
            'search'   => 1,
            'research' => 0.86602345529065,
        ], $res);
    }
}
Loading

0 comments on commit fdd9189

Please sign in to comment.