From 610c1aeb0dbe40ff4218acde73183b075a353412 Mon Sep 17 00:00:00 2001 From: Dan Gudmundsson Date: Thu, 16 Nov 2023 18:41:46 +0100 Subject: [PATCH] Add string:jaro_similarity/2 Calculate word similarity, can for example be used to provide potential alternatives in error messages. --- lib/stdlib/doc/src/string.xml | 24 ++++++++++ lib/stdlib/src/string.erl | 80 +++++++++++++++++++++++++++++++- lib/stdlib/test/string_SUITE.erl | 45 ++++++++++++++++-- 3 files changed, 144 insertions(+), 5 deletions(-) diff --git a/lib/stdlib/doc/src/string.xml b/lib/stdlib/doc/src/string.xml index 5176f6de6004..3c4815304582 100644 --- a/lib/stdlib/doc/src/string.xml +++ b/lib/stdlib/doc/src/string.xml @@ -244,6 +244,30 @@ true + + + Calculate the Jaro similarity of two strings. + +

Returns a float between +0.0 and 1.0 representing the + + Jaro similarity between the given strings. Strings with many letters + in common relative to their lengths will score closer to 1.0. +

+

The Jaro distance between two strings can be calculated with JaroDistance = 1.0-JaroSimilarity. +

+

Example:

+
+1> string:jaro_similarity("ditto", "ditto").
+1.0
+2> string:jaro_similarity("foo", "bar").
++0.0
+3> string:jaro_similarity("michelle", "michael").
+0.8690476190476191
+4> string:jaro_similarity(<<"Édouard"/utf8>>, <<"Claude">>).
+0.5317460317460317
+
+
+ Calculate length of the string. diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl index 163bd8d0811a..02593e3cff86 100644 --- a/lib/stdlib/src/string.erl +++ b/lib/stdlib/src/string.erl @@ -58,6 +58,7 @@ prefix/2, split/2,split/3,replace/3,replace/4, find/2,find/3, + jaro_similarity/2, next_codepoint/1, next_grapheme/1 ]). @@ -85,7 +86,7 @@ -type grapheme_cluster() :: char() | [char()]. -type direction() :: 'leading' | 'trailing'. --dialyzer({no_improper_lists, [stack/2, length_b/3]}). +-dialyzer({no_improper_lists, [stack/2, length_b/3, str_to_map/2]}). %%% BIFs internal (not documented) should not to be used outside of this module %%% May be removed -export([list_to_float/1, list_to_integer/1]). @@ -563,6 +564,52 @@ find(String, SearchPattern, leading) -> find(String, SearchPattern, trailing) -> find_r(String, unicode:characters_to_list(SearchPattern), nomatch). +-spec jaro_similarity(String1, String2) -> Similarity when + String1 :: unicode:chardata(), + String2 :: unicode:chardata(), + Similarity :: float(). %% Between +0.0 and 1.0 +jaro_similarity(A0, B0) -> + {A, ALen} = str_to_gcl_and_length(A0), + {B, BLen} = str_to_indexmap(B0), + Dist = max(ALen, BLen) div 2, + {AM, BM} = jaro_match(A, B, -Dist, Dist, [], []), + if + ALen =:= 0 andalso BLen =:= 0 -> + 1.0; + ALen =:= 0 orelse BLen =:= 0 -> + 0.0; + AM =:= [] -> + 0.0; + true -> + {M,T} = jaro_calc_mt(AM, BM, 0, 0), + (M/ALen + M/BLen + (M-T/2)/M) / 3 + end. + +jaro_match([A|As], B0, Min, Max, AM, BM) -> + case jaro_detect(maps:get(A, B0, []), Min, Max) of + false -> + jaro_match(As, B0, Min+1, Max+1, AM, BM); + {J, Remain} -> + B = B0#{A => Remain}, + jaro_match(As, B, Min+1, Max+1, [A|AM], add_rsorted({J,A},BM)) + end; +jaro_match(_A, _B, _Min, _Max, AM, BM) -> + {AM, BM}. + +jaro_detect([Idx|Rest], Min, Max) when Min < Idx, Idx < Max -> + {Idx, Rest}; +jaro_detect([Idx|Rest], Min, Max) when Idx < Max -> + jaro_detect(Rest, Min, Max); +jaro_detect(_, _, _) -> + false. + +jaro_calc_mt([CharA|AM], [{_, CharA}|BM], M, T) -> + jaro_calc_mt(AM, BM, M+1, T); +jaro_calc_mt([_|AM], [_|BM], M, T) -> + jaro_calc_mt(AM, BM, M+1, T+1); +jaro_calc_mt([], [], M, T) -> + {M, T}. + %% Fetch first grapheme cluster and return rest in tail -spec next_grapheme(String::unicode:chardata()) -> maybe_improper_list(grapheme_cluster(),unicode:chardata()) | @@ -1795,6 +1842,37 @@ bin_search_str_2(Bin0, Start, Cont, First, SearchCPs) -> end. +%% Returns GC list and length +str_to_gcl_and_length(S0) -> + gcl_and_length(unicode_util:gc(S0), [], 0). + +gcl_and_length([C|Str], Acc, N) -> + gcl_and_length(unicode_util:gc(Str), [C|Acc], N+1); +gcl_and_length([], Acc, N) -> + {lists:reverse(Acc), N}; +gcl_and_length({error, Err}, _, _) -> + error({badarg, Err}). + +%% Returns GC map with index and length +str_to_indexmap(S) -> + [M|L] = str_to_map(unicode_util:gc(S), 0), + {M,L}. + +str_to_map([], L) -> [#{}|L]; +str_to_map([G | Gs], I) -> + [M|L] = str_to_map(unicode_util:gc(Gs), I+1), + [maps:put(G, [I | maps:get(G, M, [])], M)| L]; +str_to_map({error,Error}, _) -> + error({badarg, Error}). + +%% Add in decreasing order +add_rsorted(A, [H|_]=BM) when A > H -> + [A|BM]; +add_rsorted(A, [H|BM]) -> + [H|add_rsorted(A,BM)]; +add_rsorted(A, []) -> + [A]. + %%--------------------------------------------------------------------------- %% OLD lists API kept for backwards compability %%--------------------------------------------------------------------------- diff --git a/lib/stdlib/test/string_SUITE.erl b/lib/stdlib/test/string_SUITE.erl index 5a75c930dd68..4e33a4d47b64 100644 --- a/lib/stdlib/test/string_SUITE.erl +++ b/lib/stdlib/test/string_SUITE.erl @@ -36,7 +36,9 @@ uppercase/1, lowercase/1, titlecase/1, casefold/1, to_integer/1,to_float/1, prefix/1, split/1, replace/1, find/1, - lexemes/1, nth_lexeme/1, cd_gc/1, meas/1 + lexemes/1, nth_lexeme/1, cd_gc/1, + jaro_similarity/1, + meas/1 ]). -export([len/1,old_equal/1,old_concat/1,chr_rchr/1,str_rstr/1]). @@ -66,6 +68,7 @@ groups() -> to_integer, to_float, uppercase, lowercase, titlecase, casefold, prefix, find, split, replace, cd_gc, + jaro_similarity, meas]}, {list_string, [len, old_equal, old_concat, chr_rchr, str_rstr, span_cspan, @@ -788,6 +791,36 @@ nth_lexeme(_) -> ?TEST([<<"aae">>,778,"öeeåäö"], [2,"e"], "åäö"), ok. +jaro_similarity(_Config) -> + ?TEST("", [""], 1.0), + ?TEST("", [["", <<"">>]], 1.0), + %% From https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance#Jaro_similarity + ?TEST("faremviel", ["farmville"], 0.8842592592592592), + ?TEST("michelle", ["michael"], 0.8690476190476191), + ?TEST("michelle", [<<"michael">>], 0.8690476190476191), + ?TEST(<<"Édouard"/utf8>>, ["Claude"], 0.5317460317460317), + + + ?TEST("farmville", ["farmville"], 1.0), + ?TEST("farmville", ["zxzxzx"], +0.0), + + ?TEST("Saturday", ["Sunday"], 0.71944444), + ?TEST("Sunday", ["Saturday"], 0.71944444), + + %% Short strings (no translations counted) + ?TEST("ca", ["abc"], 0.0), + ?TEST("ca", ["cb"], ((1/2+1/2+1)/3)), + ?TEST("ca", ["cab"], ((2/2+2/3+1)/3)), + ?TEST("caa", ["cab"], ((2/3+2/3+1)/3)), + %% With one translation + ?TEST("caabx", ["caba"], ((4/5+4/4+((4-2/2)/4))/3)), + + InvalidUTF8 = <<192,192>>, + {'EXIT', {badarg, _}} = ?TRY(string:jaro_similarity("foo", InvalidUTF8)), + {'EXIT', {badarg, _}} = ?TRY(string:jaro_similarity("foo", <<$a, InvalidUTF8/binary, $z>>)), + + ok. + meas(Config) -> Parent = self(), @@ -956,7 +989,7 @@ test_1(Line, Func, Str, Args, Exp) -> check_types(Line, Func, Args, Res), case res(Res, Exp) of true -> ok; - {Res1,Exp1} when is_tuple(Exp1) -> + {Res1,Exp1} when is_tuple(Exp1); is_float(Exp1) -> io:format("~p~n",[Args]), io:format("~p:~p: ~ts~w =>~n :~w:~w~n", [Func,Line, Str,Str,Res1,Exp1]), @@ -999,6 +1032,8 @@ res({S1,S2}=S, {Exp1,Exp2}=E) -> %% For take {true, true} -> true; _ -> {S, E} end; +res(Float, Exp) when is_float(Exp) -> + abs(Float - Exp) < 0.0000001 orelse {Float, Exp}; res(Int, Exp) -> Int == Exp orelse {Int, Exp}. @@ -1007,8 +1042,10 @@ check_types(_Line, _Func, _Str, Res) when is_integer(Res); is_boolean(Res); Res =:= nomatch -> %% length or equal ok; -check_types(Line, Func, [S1,S2], Res) - when Func =:= concat -> +check_types(_Line, jaro_similarity, _Str, Res) + when is_float(Res) -> + ok; +check_types(Line, concat = Func, [S1,S2], Res) -> case check_types_1(type(S1),type(S2)) of ok -> case check_types_1(type(S1),type(Res)) of