Skip to content

Commit

Permalink
Add string:jaro_similarity/2
Browse files Browse the repository at this point in the history
Calculate word similarity, can for example be used to provide
potential alternatives in error messages.
  • Loading branch information
dgud committed Nov 22, 2023
1 parent fbe53ad commit 610c1ae
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 5 deletions.
24 changes: 24 additions & 0 deletions lib/stdlib/doc/src/string.xml
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,30 @@ true</pre>
</desc>
</func>

<func>
<name name="jaro_similarity" arity="2" since="OTP 27.0"/>
<fsummary>Calculate the Jaro similarity of two strings.</fsummary>
<desc>
<p>Returns a float between <c>+0.0</c> and <c>1.0</c> representing the
<url href="https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
Jaro similarity</url> between the given strings. Strings with many letters
in common relative to their lengths will score closer to <c>1.0</c>.
</p>
<p>The Jaro distance between two strings can be calculated with <c>JaroDistance = 1.0-JaroSimilarity</c>.
</p>
<p><em>Example:</em></p>
<pre>
1> <input>string:jaro_similarity("ditto", "ditto").</input>
1.0
2> <input>string:jaro_similarity("foo", "bar").</input>
+0.0
3> <input>string:jaro_similarity("michelle", "michael").</input>
0.8690476190476191
4> <input>string:jaro_similarity(&lt;&lt;"Édouard"/utf8>>, &lt;&lt;"Claude">>).</input>
0.5317460317460317</pre>
</desc>
</func>

<func>
<name name="length" arity="1" since="OTP 20.0"/>
<fsummary>Calculate length of the string.</fsummary>
Expand Down
80 changes: 79 additions & 1 deletion lib/stdlib/src/string.erl
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
prefix/2,
split/2,split/3,replace/3,replace/4,
find/2,find/3,
jaro_similarity/2,
next_codepoint/1, next_grapheme/1
]).

Expand Down Expand Up @@ -85,7 +86,7 @@
-type grapheme_cluster() :: char() | [char()].
-type direction() :: 'leading' | 'trailing'.

-dialyzer({no_improper_lists, [stack/2, length_b/3]}).
-dialyzer({no_improper_lists, [stack/2, length_b/3, str_to_map/2]}).
%%% BIFs internal (not documented) should not to be used outside of this module
%%% May be removed
-export([list_to_float/1, list_to_integer/1]).
Expand Down Expand Up @@ -563,6 +564,52 @@ find(String, SearchPattern, leading) ->
find(String, SearchPattern, trailing) ->
find_r(String, unicode:characters_to_list(SearchPattern), nomatch).

-spec jaro_similarity(String1, String2) -> Similarity when
String1 :: unicode:chardata(),
String2 :: unicode:chardata(),
Similarity :: float(). %% Between +0.0 and 1.0
jaro_similarity(A0, B0) ->
{A, ALen} = str_to_gcl_and_length(A0),
{B, BLen} = str_to_indexmap(B0),
Dist = max(ALen, BLen) div 2,
{AM, BM} = jaro_match(A, B, -Dist, Dist, [], []),
if
ALen =:= 0 andalso BLen =:= 0 ->
1.0;
ALen =:= 0 orelse BLen =:= 0 ->
0.0;
AM =:= [] ->
0.0;
true ->
{M,T} = jaro_calc_mt(AM, BM, 0, 0),
(M/ALen + M/BLen + (M-T/2)/M) / 3
end.

jaro_match([A|As], B0, Min, Max, AM, BM) ->
case jaro_detect(maps:get(A, B0, []), Min, Max) of
false ->
jaro_match(As, B0, Min+1, Max+1, AM, BM);
{J, Remain} ->
B = B0#{A => Remain},
jaro_match(As, B, Min+1, Max+1, [A|AM], add_rsorted({J,A},BM))
end;
jaro_match(_A, _B, _Min, _Max, AM, BM) ->
{AM, BM}.

jaro_detect([Idx|Rest], Min, Max) when Min < Idx, Idx < Max ->
{Idx, Rest};
jaro_detect([Idx|Rest], Min, Max) when Idx < Max ->
jaro_detect(Rest, Min, Max);
jaro_detect(_, _, _) ->
false.

jaro_calc_mt([CharA|AM], [{_, CharA}|BM], M, T) ->
jaro_calc_mt(AM, BM, M+1, T);
jaro_calc_mt([_|AM], [_|BM], M, T) ->
jaro_calc_mt(AM, BM, M+1, T+1);
jaro_calc_mt([], [], M, T) ->
{M, T}.

%% Fetch first grapheme cluster and return rest in tail
-spec next_grapheme(String::unicode:chardata()) ->
maybe_improper_list(grapheme_cluster(),unicode:chardata()) |
Expand Down Expand Up @@ -1795,6 +1842,37 @@ bin_search_str_2(Bin0, Start, Cont, First, SearchCPs) ->
end.


%% Returns GC list and length
str_to_gcl_and_length(S0) ->
gcl_and_length(unicode_util:gc(S0), [], 0).

gcl_and_length([C|Str], Acc, N) ->
gcl_and_length(unicode_util:gc(Str), [C|Acc], N+1);
gcl_and_length([], Acc, N) ->
{lists:reverse(Acc), N};
gcl_and_length({error, Err}, _, _) ->
error({badarg, Err}).

%% Returns GC map with index and length
str_to_indexmap(S) ->
[M|L] = str_to_map(unicode_util:gc(S), 0),
{M,L}.

str_to_map([], L) -> [#{}|L];
str_to_map([G | Gs], I) ->
[M|L] = str_to_map(unicode_util:gc(Gs), I+1),
[maps:put(G, [I | maps:get(G, M, [])], M)| L];
str_to_map({error,Error}, _) ->
error({badarg, Error}).

%% Add in decreasing order
add_rsorted(A, [H|_]=BM) when A > H ->
[A|BM];
add_rsorted(A, [H|BM]) ->
[H|add_rsorted(A,BM)];
add_rsorted(A, []) ->
[A].

%%---------------------------------------------------------------------------
%% OLD lists API kept for backwards compability
%%---------------------------------------------------------------------------
Expand Down
45 changes: 41 additions & 4 deletions lib/stdlib/test/string_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@
uppercase/1, lowercase/1, titlecase/1, casefold/1,
to_integer/1,to_float/1,
prefix/1, split/1, replace/1, find/1,
lexemes/1, nth_lexeme/1, cd_gc/1, meas/1
lexemes/1, nth_lexeme/1, cd_gc/1,
jaro_similarity/1,
meas/1
]).

-export([len/1,old_equal/1,old_concat/1,chr_rchr/1,str_rstr/1]).
Expand Down Expand Up @@ -66,6 +68,7 @@ groups() ->
to_integer, to_float,
uppercase, lowercase, titlecase, casefold,
prefix, find, split, replace, cd_gc,
jaro_similarity,
meas]},
{list_string,
[len, old_equal, old_concat, chr_rchr, str_rstr, span_cspan,
Expand Down Expand Up @@ -788,6 +791,36 @@ nth_lexeme(_) ->
?TEST([<<"aae">>,778,"öeeåäö"], [2,"e"], "åäö"),
ok.

jaro_similarity(_Config) ->
?TEST("", [""], 1.0),
?TEST("", [["", <<"">>]], 1.0),
%% From https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance#Jaro_similarity
?TEST("faremviel", ["farmville"], 0.8842592592592592),
?TEST("michelle", ["michael"], 0.8690476190476191),
?TEST("michelle", [<<"michael">>], 0.8690476190476191),
?TEST(<<"Édouard"/utf8>>, ["Claude"], 0.5317460317460317),


?TEST("farmville", ["farmville"], 1.0),
?TEST("farmville", ["zxzxzx"], +0.0),

?TEST("Saturday", ["Sunday"], 0.71944444),
?TEST("Sunday", ["Saturday"], 0.71944444),

%% Short strings (no translations counted)
?TEST("ca", ["abc"], 0.0),
?TEST("ca", ["cb"], ((1/2+1/2+1)/3)),
?TEST("ca", ["cab"], ((2/2+2/3+1)/3)),
?TEST("caa", ["cab"], ((2/3+2/3+1)/3)),
%% With one translation
?TEST("caabx", ["caba"], ((4/5+4/4+((4-2/2)/4))/3)),

InvalidUTF8 = <<192,192>>,
{'EXIT', {badarg, _}} = ?TRY(string:jaro_similarity("foo", InvalidUTF8)),
{'EXIT', {badarg, _}} = ?TRY(string:jaro_similarity("foo", <<$a, InvalidUTF8/binary, $z>>)),

ok.


meas(Config) ->
Parent = self(),
Expand Down Expand Up @@ -956,7 +989,7 @@ test_1(Line, Func, Str, Args, Exp) ->
check_types(Line, Func, Args, Res),
case res(Res, Exp) of
true -> ok;
{Res1,Exp1} when is_tuple(Exp1) ->
{Res1,Exp1} when is_tuple(Exp1); is_float(Exp1) ->
io:format("~p~n",[Args]),
io:format("~p:~p: ~ts~w =>~n :~w:~w~n",
[Func,Line, Str,Str,Res1,Exp1]),
Expand Down Expand Up @@ -999,6 +1032,8 @@ res({S1,S2}=S, {Exp1,Exp2}=E) -> %% For take
{true, true} -> true;
_ -> {S, E}
end;
res(Float, Exp) when is_float(Exp) ->
abs(Float - Exp) < 0.0000001 orelse {Float, Exp};
res(Int, Exp) ->
Int == Exp orelse {Int, Exp}.

Expand All @@ -1007,8 +1042,10 @@ check_types(_Line, _Func, _Str, Res)
when is_integer(Res); is_boolean(Res); Res =:= nomatch ->
%% length or equal
ok;
check_types(Line, Func, [S1,S2], Res)
when Func =:= concat ->
check_types(_Line, jaro_similarity, _Str, Res)
when is_float(Res) ->
ok;
check_types(Line, concat = Func, [S1,S2], Res) ->
case check_types_1(type(S1),type(S2)) of
ok ->
case check_types_1(type(S1),type(Res)) of
Expand Down

0 comments on commit 610c1ae

Please sign in to comment.