Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add benchmark project, some small optimizations #39

Merged
merged 5 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions F23.StringSimilarity.Benchmarks/Benchmarks.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
using BenchmarkDotNet.Attributes;

namespace F23.StringSimilarity.Benchmarks;

/// <summary>
/// Benchmark suite with one benchmark per string-distance algorithm.
/// Every benchmark constructs a fresh algorithm instance and computes the distance
/// between the same two short words, so instance construction is part of what is measured.
/// The memory diagnoser is enabled for the whole suite.
/// </summary>
[MemoryDiagnoser]
public class Benchmarks
{
    // Inputs shared by every benchmark so that all algorithms process identical data.
    private const string Left = "hello";
    private const string Right = "world";

    /// <summary>Benchmarks <c>Cosine.Distance</c>.</summary>
    [Benchmark]
    public void Cosine() => _ = new Cosine().Distance(Left, Right);

    /// <summary>Benchmarks <c>Damerau.Distance</c>.</summary>
    [Benchmark]
    public void Damerau() => _ = new Damerau().Distance(Left, Right);

    /// <summary>Benchmarks <c>Jaccard.Distance</c>.</summary>
    [Benchmark]
    public void Jaccard() => _ = new Jaccard().Distance(Left, Right);

    /// <summary>Benchmarks <c>JaroWinkler.Distance</c>.</summary>
    [Benchmark]
    public void JaroWinkler() => _ = new JaroWinkler().Distance(Left, Right);

    /// <summary>Benchmarks <c>Levenshtein.Distance</c>.</summary>
    [Benchmark]
    public void Levenshtein() => _ = new Levenshtein().Distance(Left, Right);

    /// <summary>Benchmarks <c>LongestCommonSubsequence.Distance</c>.</summary>
    [Benchmark]
    public void LongestCommonSubsequence() => _ = new LongestCommonSubsequence().Distance(Left, Right);

    /// <summary>Benchmarks <c>MetricLCS.Distance</c>.</summary>
    [Benchmark]
    public void MetricLCS() => _ = new MetricLCS().Distance(Left, Right);

    /// <summary>Benchmarks <c>NGram.Distance</c>.</summary>
    [Benchmark]
    public void NGram() => _ = new NGram().Distance(Left, Right);

    /// <summary>Benchmarks <c>NormalizedLevenshtein.Distance</c>.</summary>
    [Benchmark]
    public void NormalizedLevenshtein() => _ = new NormalizedLevenshtein().Distance(Left, Right);

    /// <summary>Benchmarks <c>OptimalStringAlignment.Distance</c>.</summary>
    [Benchmark]
    public void OptimalStringAlignment() => _ = new OptimalStringAlignment().Distance(Left, Right);

    /// <summary>Benchmarks <c>QGram.Distance</c>.</summary>
    [Benchmark]
    public void QGram() => _ = new QGram().Distance(Left, Right);

    /// <summary>Benchmarks <c>RatcliffObershelp.Distance</c>.</summary>
    [Benchmark]
    public void RatcliffObershelp() => _ = new RatcliffObershelp().Distance(Left, Right);

    /// <summary>Benchmarks <c>SorensenDice.Distance</c>.</summary>
    [Benchmark]
    public void SorensenDice() => _ = new SorensenDice().Distance(Left, Right);

    /// <summary>
    /// Benchmarks <c>WeightedLevenshtein.Distance</c> using the
    /// <see cref="ExampleCharSub"/> substitution costs.
    /// </summary>
    [Benchmark]
    public void WeightedLevenshtein() =>
        _ = new WeightedLevenshtein(new ExampleCharSub()).Distance(Left, Right);

    /// <summary>
    /// Sample substitution-cost table handed to the weighted Levenshtein benchmark.
    /// </summary>
    private class ExampleCharSub : ICharacterSubstitution
    {
        /// <summary>
        /// Returns the cost of replacing <paramref name="c1"/> with <paramref name="c2"/>.
        /// </summary>
        /// <returns>
        /// 0.5 for 't' replaced by 'r' (the two keys sit next to each other on a keyboard,
        /// so the substitution is treated as cheaper); 1.0 for every other pair.
        /// </returns>
        public double Cost(char c1, char c2)
            => c1 == 't' && c2 == 'r' ? 0.5 : 1.0;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<!-- Console project that hosts the benchmarks for the F23.StringSimilarity library. -->
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<!-- Built as an executable so the benchmarks can be launched directly. -->
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>

<!-- Benchmarking framework used by the benchmark classes in this project. -->
<ItemGroup>
<PackageReference Include="BenchmarkDotNet" Version="0.13.12" />
</ItemGroup>

<!-- The library under measurement, referenced from source rather than as a package. -->
<ItemGroup>
<ProjectReference Include="..\src\F23.StringSimilarity\F23.StringSimilarity.csproj" />
</ItemGroup>

</Project>
4 changes: 4 additions & 0 deletions F23.StringSimilarity.Benchmarks/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
using BenchmarkDotNet.Running;
using F23.StringSimilarity.Benchmarks;

// Entry point: run every benchmark declared on the Benchmarks class.
// The value returned by the runner is not used here, so it is discarded explicitly.
_ = BenchmarkRunner.Run<Benchmarks>();
6 changes: 6 additions & 0 deletions F23.StringSimilarity.sln
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity", "src
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity.Tests", "test\F23.StringSimilarity.Tests\F23.StringSimilarity.Tests.csproj", "{68F339E6-278F-4B04-A6ED-422AAD30591F}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "F23.StringSimilarity.Benchmarks", "F23.StringSimilarity.Benchmarks\F23.StringSimilarity.Benchmarks.csproj", "{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand All @@ -21,6 +23,10 @@ Global
{68F339E6-278F-4B04-A6ED-422AAD30591F}.Debug|Any CPU.Build.0 = Debug|Any CPU
{68F339E6-278F-4B04-A6ED-422AAD30591F}.Release|Any CPU.ActiveCfg = Release|Any CPU
{68F339E6-278F-4B04-A6ED-422AAD30591F}.Release|Any CPU.Build.0 = Release|Any CPU
{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{3A9605B1-820C-43C2-8F9B-72BCA5F5543B}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down
10 changes: 5 additions & 5 deletions src/F23.StringSimilarity/Jaccard.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

using System;
using System.Collections.Generic;
using System.Linq;
using F23.StringSimilarity.Interfaces;

// ReSharper disable LoopCanBeConvertedToQuery
Expand Down Expand Up @@ -83,14 +84,13 @@ public double Similarity(string s1, string s2)
var profile1 = GetProfile(s1);
var profile2 = GetProfile(s2);

var union = new HashSet<string>();
union.UnionWith(profile1.Keys);
union.UnionWith(profile2.Keys);
// SSNET Specific: use LINQ for more optimal distinct count
var unionCount = profile1.Keys.Concat(profile2.Keys).Distinct().Count();

int inter = profile1.Keys.Count + profile2.Keys.Count
- union.Count;
- unionCount;

return 1.0 * inter / union.Count;
return 1.0 * inter / unionCount;
}


Expand Down
6 changes: 2 additions & 4 deletions src/F23.StringSimilarity/Levenshtein.cs
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2, int limit)
// create two work vectors of integer distances
int[] v0 = new int[s2.Length + 1];
int[] v1 = new int[s2.Length + 1];
int[] vtemp;
// SSNET: removed unneeded int[] vtemp;

// initialize v0 (the previous row of distances)
// this row is A[0][i]: edit distance for an empty s1
Expand Down Expand Up @@ -155,9 +155,7 @@ public double Distance<T>(ReadOnlySpan<T> s1, ReadOnlySpan<T> s2, int limit)
// System.arraycopy(v1, 0, v0, 0, v0.length);

// Flip references to current and previous row
vtemp = v0;
v0 = v1;
v1 = vtemp;
(v0, v1) = (v1, v0); // SSNET specific: Swap v0 and v1 using tuples
}

return v0[s2.Length];
Expand Down
6 changes: 2 additions & 4 deletions src/F23.StringSimilarity/NGram.cs
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ public double Distance(string s0, string s1)
char[] sa = new char[sl + n - 1];
float[] p; // 'previous' cost array, horizontally
float[] d; // Cost array, horizontally
float[] d2; // Placeholder to assist in swapping p and d
// SSNET removed unneeded: float[] d2; // Placeholder to assist in swapping p and d

// Construct sa with prefix
for (int i1 = 0; i1 < sa.Length; i1++)
Expand Down Expand Up @@ -172,9 +172,7 @@ public double Distance(string s0, string s1)
d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + ec);
}
// Copy current distance counts to 'previous row' distance counts
d2 = p;
p = d;
d = d2;
(p, d) = (d, p); // SSNET specific: swap p and d using tuples
}

// Our last action in the above loop was to switch d and p, so p now
Expand Down
6 changes: 3 additions & 3 deletions src/F23.StringSimilarity/ShingleBased.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public abstract class ShingleBased
/// <summary>
/// Pattern for finding multiple following spaces
/// </summary>
private static readonly Regex SPACE_REG = new Regex("\\s+");
private static readonly Regex SPACE_REG = new Regex("\\s+", RegexOptions.Compiled);

/// <summary>
/// </summary>
Expand All @@ -59,7 +59,7 @@ protected ShingleBased(int k)

protected ShingleBased() : this(DEFAULT_K) { }

public IDictionary<string, int> GetProfile(string s)
protected internal Dictionary<string, int> GetProfile(string s)
{
var shingles = new Dictionary<string, int>();

Expand All @@ -79,7 +79,7 @@ public IDictionary<string, int> GetProfile(string s)
}
}

return new ReadOnlyDictionary<string, int>(shingles);
return shingles;
}
}
}
7 changes: 3 additions & 4 deletions src/F23.StringSimilarity/WeightedLevenshtein.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
*/

using System;
using System.Threading;
using F23.StringSimilarity.Interfaces;
// ReSharper disable SuggestVarOrType_Elsewhere
// ReSharper disable TooWideLocalVariableScope
Expand Down Expand Up @@ -112,7 +113,7 @@ public double Distance(string s1, string s2, double limit)
// create two work vectors of floating point (i.e. weighted) distances
double[] v0 = new double[s2.Length + 1];
double[] v1 = new double[s2.Length + 1];
double[] vtemp;
// SSNET: removed unneeded double[] vtemp;

// initialize v0 (the previous row of distances)
// this row is A[0][i]: edit distance for an empty s1
Expand Down Expand Up @@ -166,9 +167,7 @@ public double Distance(string s1, string s2, double limit)
// copy v1 (current row) to v0 (previous row) for next iteration
// System.arraycopy(v1, 0, v0, 0, v0.length);
// Flip references to current and previous row
vtemp = v0;
v0 = v1;
v1 = vtemp;
(v0, v1) = (v1, v0); // SSNET Specific: Swap references using tuples instead of temporary
}

return v0[s2.Length];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@
<ProjectReference Include="..\..\src\F23.StringSimilarity\F23.StringSimilarity.csproj" />
</ItemGroup>

</Project>
</Project>
Loading