diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..3163b36 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,15 @@ +# See: https://docs.microsoft.com/en-us/dotnet/fundamentals/code-analysis/style-rules/formatting-rules + +[*.cs] +indent_style=tab +indent_size=tab +tab_width=4 + +[*.{appxmanifest,axml,build,config,csproj,dbml,discomap,dtd,json,jsproj,lsproj,njsproj,nuspec,proj,props,resjson,resw,resx,StyleCop,targets,tasks,vbproj,xml,xsd}] +indent_style=space +indent_size=2 +tab_width=2 + +[*.cs] +csharp_prefer_braces = true +csharp_new_line_before_open_brace = methods, properties, control_blocks, types \ No newline at end of file diff --git a/LunrCore.sln b/LunrCore.sln index a66e34a..5f0751e 100644 --- a/LunrCore.sln +++ b/LunrCore.sln @@ -9,6 +9,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LunrCoreTests", "LunrCoreTe EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Metadata", "Metadata", "{333885CB-3ADF-4462-8910-27A16B6C5F55}" ProjectSection(SolutionItems) = preProject + .editorconfig = .editorconfig CODE_OF_CONDUCT.md = CODE_OF_CONDUCT.md LICENSE = LICENSE README.md = README.md @@ -20,7 +21,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LunrCoreLmdb", "LunrCoreLmd EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LunrCoreLmdbTests", "LunrCoreLmdbTests\LunrCoreLmdbTests.csproj", "{2EF25270-1D0D-4450-ADBB-BCFD08FB9BB6}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LunrCoreLmdbPerf", "LunrCoreLmdbPerf\LunrCoreLmdbPerf.csproj", "{41BB51FD-462C-4AAB-9B4D-127FD784B566}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LunrCoreLmdbPerf", "LunrCoreLmdbPerf\LunrCoreLmdbPerf.csproj", "{41BB51FD-462C-4AAB-9B4D-127FD784B566}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/LunrCore/Globalization/Among.cs b/LunrCore/Globalization/Among.cs new file mode 100644 index 0000000..5c58292 --- /dev/null +++ b/LunrCore/Globalization/Among.cs 
using System;
using System.Linq;

namespace Lunr.Globalization
{
	/// <summary>
	/// A snowball "among" entry: a candidate affix together with the result code to
	/// return on a match, the index of the next candidate to fall back to
	/// (<see cref="Substring"/>), and an optional guard predicate (<see cref="Method"/>).
	/// Mirrors the Among structure of the snowball stemmer reference implementation.
	/// </summary>
	public readonly struct Among : IEquatable<Among>
	{
		public int Size { get; }
		public char[] StringArray { get; }
		public int Result { get; }
		public Func<bool>? Method { get; }
		public int Substring { get; }

		public Among(string s, int substring, int result, Func<bool>? method = null)
		{
			if (s == null)
			{
				throw new ArgumentNullException(nameof(s));
			}

			Size = s.Length;
			StringArray = s.ToCharArray();
			Substring = substring;
			Result = result;
			Method = method;
		}

		public bool Equals(Among other)
		{
			return Size == other.Size &&
				StringArray.SequenceEqual(other.StringArray) &&
				Result == other.Result &&
				Method == other.Method &&
				Substring == other.Substring;
		}

		public override bool Equals(object? obj)
		{
			return obj is Among other && Equals(other);
		}

		public override int GetHashCode()
		{
			// Hash the character CONTENT, not the array reference: Equals compares
			// via SequenceEqual, so equal values must produce equal hash codes.
			var chars = 0;
			if (StringArray != null)
			{
				foreach (var c in StringArray)
				{
					chars = unchecked(chars * 31 + c);
				}
			}

			return (Size, chars, Result, Method, Substring).GetHashCode();
		}

		public static bool operator ==(Among left, Among right)
		{
			return left.Equals(right);
		}

		public static bool operator !=(Among left, Among right)
		{
			return !left.Equals(right);
		}
	}
}
diff --git a/LunrCore/Globalization/SnowballProgram.cs b/LunrCore/Globalization/SnowballProgram.cs
using System;

namespace Lunr.Globalization
{
	/// <summary>
	/// C# port of the snowball stemmer runtime (lunr-languages' SnowballProgram):
	/// a cursor/limit state machine over the current word, driven by the generated
	/// per-language stemmers.
	/// </summary>
	internal sealed class SnowballProgram
	{
		internal int bra;
		private string current;

		internal int cursor;
		internal int ket;
		internal int limit;
		internal int limit_backward;

		/// <summary>Loads a new word and resets all cursors/slice markers.</summary>
		public void SetCurrent(string word)
		{
			current = word;
			cursor = 0;
			limit = word.Length;
			limit_backward = 0;
			bra = cursor;
			ket = limit;
		}

		/// <summary>Returns the (possibly modified) word and releases the reference.</summary>
		public string GetCurrent()
		{
			var result = current;
			current = null!;
			return result;
		}

		/// <summary>
		/// If the character at the cursor is a member of the grouping bitmap
		/// <paramref name="s"/> (bit set = member), advances the cursor and returns true.
		/// </summary>
		public bool InGrouping(int[] s, int min, int max)
		{
			if (cursor < limit)
			{
				var ch = (int) current[cursor];
				if (ch <= max && ch >= min)
				{
					ch -= min;
					var r = s[ch >> 3] & (0x1 << (ch & 0x7));
					// Snowball reference advances when the grouping bit IS set
					// (`if (s[ch >> 3] & (0x1 << (ch & 0x7)))`); the previous
					// `r == 0` test was inverted and rejected every vowel in g_v.
					if (r != 0)
					{
						cursor++;
						return true;
					}
				}
			}

			return false;
		}

		/// <summary>Backward variant of <see cref="InGrouping"/> (moves the cursor left).</summary>
		public bool InGroupingBackwards(int[] s, int min, int max)
		{
			if (cursor > limit_backward)
			{
				var ch = (int) current[cursor - 1];
				if (ch <= max && ch >= min)
				{
					ch -= min;
					var r = s[ch >> 3] & (0x1 << (ch & 0x7));
					// See InGrouping: membership means the bit is set.
					if (r != 0)
					{
						cursor--;
						return true;
					}
				}
			}

			return false;
		}

		/// <summary>
		/// If the character at the cursor is NOT a member of the grouping bitmap,
		/// advances the cursor and returns true.
		/// </summary>
		public bool OutGrouping(int[] s, int min, int max)
		{
			if (cursor < limit)
			{
				var ch = (int) current[cursor];
				if (ch > max || ch < min)
				{
					cursor++;
					return true;
				}

				ch -= min;
				var r = s[ch >> 3] & (0x1 << (ch & 0x7));
				// Snowball reference advances when the grouping bit is NOT set;
				// the previous `r != 0` test was inverted.
				if (r == 0)
				{
					cursor++;
					return true;
				}
			}

			return false;
		}

		/// <summary>Backward variant of <see cref="OutGrouping"/> (moves the cursor left).</summary>
		public bool OutGroupingBackwards(int[] s, int min, int max)
		{
			if (cursor > limit_backward)
			{
				var ch = (int) current[cursor - 1];
				if (ch > max || ch < min)
				{
					cursor--;
					return true;
				}

				ch -= min;
				var r = s[ch >> 3] & (0x1 << (ch & 0x7));
				// See OutGrouping: non-membership means the bit is NOT set.
				if (r == 0)
				{
					cursor--;
					return true;
				}
			}

			return false;
		}

		/// <summary>
		/// If the next s_size characters at the cursor equal <paramref name="s"/>,
		/// advances the cursor past them and returns true.
		/// </summary>
		public bool EqualsSegment(int s_size, string s)
		{
			if (limit - cursor < s_size)
			{
				return false;
			}

			for (var i = 0; i < s_size; i++)
			{
				if (current[cursor + i] != s[i])
				{
					return false;
				}
			}

			cursor += s_size;
			return true;
		}

		/// <summary>Backward variant of <see cref="EqualsSegment"/> (compares the s_size characters before the cursor).</summary>
		public bool EqualsSegmentBackwards(int s_size, string s)
		{
			if (cursor - limit_backward < s_size)
			{
				return false;
			}

			for (var i = 0; i < s_size; i++)
			{
				if (current[cursor - s_size + i] != s[i])
				{
					return false;
				}
			}

			cursor -= s_size;
			return true;
		}

		/// <summary>
		/// Binary-searches the sorted Among table for the longest entry matching at the
		/// cursor (forward), honoring guard methods and fallback links; returns the
		/// matched entry's Result, or 0 when nothing matches.
		/// </summary>
		public int FindAmong(Among[] v, int v_size)
		{
			var i = 0;
			var j = v_size;
			var c = cursor;
			var l = limit;
			var common_i = 0;
			var common_j = 0;
			var first_key_inspected = false;

			// Phase 1: bisect [i, j) on the shared-prefix comparison.
			while (true)
			{
				var k = i + ((j - i) >> 1);
				var diff = 0;
				var common = common_i < common_j ? common_i : common_j;
				var w = v[k];

				for (var i2 = common; i2 < w.Size; i2++)
				{
					if (c + common == l)
					{
						diff = -1;
						break;
					}

					diff = current[c + common] - w.StringArray[i2];
					if (diff != 0)
					{
						break;
					}

					common++;
				}

				if (diff < 0)
				{
					j = k;
					common_j = common;
				}
				else
				{
					i = k;
					common_i = common;
				}

				if (j - i <= 1)
				{
					if (i > 0 || j == i || first_key_inspected)
					{
						break;
					}

					first_key_inspected = true;
				}
			}

			// Phase 2: walk the Substring fallback chain until a candidate fully
			// matches and its guard (if any) accepts.
			while (true)
			{
				var w = v[i];
				if (common_i >= w.Size)
				{
					cursor = c + w.Size;
					if (w.Method == null)
					{
						return w.Result;
					}

					var res = w.Method();
					cursor = c + w.Size;
					if (res)
					{
						return w.Result;
					}
				}

				i = w.Substring;
				if (i < 0)
				{
					return 0;
				}
			}
		}

		/// <summary>Backward variant of <see cref="FindAmong"/> (matches suffixes ending at the cursor).</summary>
		public int FindAmongBackwards(Among[] v, int v_size)
		{
			var i = 0;
			var j = v_size;
			var c = cursor;
			var lb = limit_backward;
			var common_i = 0;
			var common_j = 0;
			var first_key_inspected = false;

			while (true)
			{
				var k = i + ((j - i) >> 1);
				var diff = 0;
				var common = common_i < common_j ? common_i : common_j;
				var w = v[k];

				// Candidate strings are compared right-to-left.
				for (var i2 = w.Size - 1 - common; i2 >= 0; i2--)
				{
					if (c - common == lb)
					{
						diff = -1;
						break;
					}

					diff = current[c - 1 - common] - w.StringArray[i2];
					if (diff != 0)
					{
						break;
					}

					common++;
				}

				if (diff < 0)
				{
					j = k;
					common_j = common;
				}
				else
				{
					i = k;
					common_i = common;
				}

				if (j - i <= 1)
				{
					if (i > 0 || j == i || first_key_inspected)
					{
						break;
					}

					first_key_inspected = true;
				}
			}

			while (true)
			{
				var w = v[i];
				if (common_i >= w.Size)
				{
					cursor = c - w.Size;
					if (w.Method == null)
					{
						return w.Result;
					}

					var res = w.Method();
					cursor = c - w.Size;
					if (res)
					{
						return w.Result;
					}
				}

				i = w.Substring;
				if (i < 0)
				{
					return 0;
				}
			}
		}

		/// <summary>
		/// Replaces current[c_bra..c_ket) with <paramref name="s"/>, adjusting limit
		/// and cursor; returns the length adjustment (may be negative).
		/// </summary>
		public int ReplaceSegment(int c_bra, int c_ket, string s)
		{
			var adjustment = s.Length - (c_ket - c_bra);
			var left = current.Substring(0, c_bra);
			var right = current.Substring(c_ket);

			current = left + s + right;
			limit += adjustment;
			if (cursor >= c_ket)
			{
				cursor += adjustment;
			}
			else if (cursor > c_bra)
			{
				cursor = c_bra;
			}

			return adjustment;
		}

		/// <summary>Validates the [bra, ket] slice markers before a slice operation.</summary>
		public void SliceCheck()
		{
			if (bra < 0 || bra > ket || ket > limit || limit > current?.Length)
			{
				throw new InvalidOperationException("faulty slice operation");
			}
		}

		/// <summary>Replaces the current [bra, ket) slice with <paramref name="s"/>.</summary>
		public void SliceFrom(string s)
		{
			SliceCheck();
			ReplaceSegment(bra, ket, s);
		}

		/// <summary>Deletes the current [bra, ket) slice.</summary>
		public void SliceDelete()
		{
			SliceFrom("");
		}

		/// <summary>Inserts by replacing [c_bra, c_ket) with <paramref name="s"/>, shifting bra/ket as needed.</summary>
		public void Insert(int c_bra, int c_ket, string s)
		{
			var adjustment = ReplaceSegment(c_bra, c_ket, s);
			if (c_bra <= bra)
			{
				bra += adjustment;
			}

			if (c_bra <= ket)
			{
				ket += adjustment;
			}
		}

		/// <summary>Returns the text of the current [bra, ket) slice.</summary>
		public string SliceTo()
		{
			SliceCheck();
			// Substring's second argument is a LENGTH, not an end index: the JS
			// original is current.slice(bra, ket), i.e. length (ket - bra).
			// Substring(bra, ket) would return the wrong text or throw when
			// bra + ket exceeds the string length.
			return current.Substring(bra, ket - bra);
		}

		/// <summary>Backward comparison against <paramref name="s"/> using its full length.</summary>
		public bool EqualsValueBackwards(string s)
		{
			return EqualsSegmentBackwards(s.Length, s);
		}
	}
}
diff --git a/LunrCore/Globalization/WordCharacters.cs b/LunrCore/Globalization/WordCharacters.cs
namespace Lunr.Globalization
{
	internal static class
WordCharacters + { + public const string Arabic = "، اض امين اه اها اي ا اب اجل اجمع اخ اخذ اصبح اضحى اقبل اقل اكثر الا ام اما امامك امامك امسى اما ان انا انت انتم انتما انتن انت انشا انى او اوشك اولئك اولئكم اولاء اولالك اوه اي ايا اين اينما اي ان اي اف اذ اذا اذا اذما اذن الى اليكم اليكما اليكن اليك اليك الا اما ان انما اي اياك اياكم اياكما اياكن ايانا اياه اياها اياهم اياهما اياهن اياي ايه ان ا ابتدا اثر اجل احد اخرى اخلولق اذا اربعة ارتد استحال اطار اعادة اعلنت اف اكثر اكد الالاء الالى الا الاخيرة الان الاول الاولى التى التي الثاني الثانية الذاتي الذى الذي الذين السابق الف اللائي اللاتي اللتان اللتيا اللتين اللذان اللذين اللواتي الماضي المقبل الوقت الى اليوم اما امام امس ان انبرى انقلب انه انها او اول اي ايار ايام ايضا ب بات باسم بان بخ برس بسبب بس بشكل بضع بطان بعد بعض بك بكم بكما بكن بل بلى بما بماذا بمن بن بنا به بها بي بيد بين بس بله بئس تان تانك تبدل تجاه تحول تلقاء تلك تلكم تلكما تم تينك تين ته تي ثلاثة ثم ثم ثمة ثم جعل جلل جميع جير حار حاشا حاليا حاي حتى حرى حسب حم حوالى حول حيث حيثما حين حي حبذا حتى حذار خلا خلال دون دونك ذا ذات ذاك ذانك ذان ذلك ذلكم ذلكما ذلكن ذو ذوا ذواتا ذواتي ذيت ذينك ذين ذه ذي راح رجع رويدك ريث رب زيارة سبحان سرعان سنة سنوات سوف سوى ساء ساءما شبه شخصا شرع شتان صار صباح صفر صه صه ضد ضمن طاق طالما طفق طق ظل عاد عام عاما عامة عدا عدة عدد عدم عسى عشر عشرة علق على عليك عليه عليها عل عن عند عندما عوض عين عدس عما غدا غير ف فان فلان فو فى في فيم فيما فيه فيها قال قام قبل قد قط قلما قوة كانما كاين كاي كاين كاد كان كانت كذا كذلك كرب كل كلا كلاهما كلتا كلم كليكما كليهما كلما كلا كم كما كي كيت كيف كيفما كان كخ لئن لا لات لاسيما لدن لدى لعمر لقاء لك لكم لكما لكن لكنما لكي لكيلا للامم لم لما لما لن لنا له لها لو لوكالة لولا لوما لي لست لست لستم لستما لستن لست لسن لعل لكن ليت ليس ليسا ليستا ليست ليسوا لسنا ما ماانفك مابرح مادام ماذا مازال مافتئ مايو متى مثل مذ مساء مع معاذ مقابل مكانكم مكانكما مكانكن مكانك مليار مليون مما ممن من منذ منها مه مهما من من نحن نحو نعم نفس نفسه نهاية نخ نعما نعم ها هاؤم هاك هاهنا هب هذا هذه هكذا هل هلم هلا هم هما هن 
هنا هناك هنالك هو هي هيا هيت هيا هؤلاء هاتان هاتين هاته هاتي هج هذا هذان هذين هذه هذي هيهات وا واحد واضاف واضافت واكد وان واها واوضح وراءك وفي وقال وقالت وقد وقف وكان وكانت ولا ولم ومن وهو وهي ويكان وي وشكان يكون يمكن يوم ايان"; + public const string Danish = Latin; + public const string Dutch = Latin; + public const string Spanish = Latin; + public const string Finnish = Latin; + public const string French = Latin; + public const string German = Latin; + public const string Hungarian = Latin; + public const string Italian = Latin; + public const string Japanese = "一二三四五六七八九十百千万億兆一-龠々〆ヵヶぁ-んァ-ヴーア-ン゙a-zA-Za-zA-Z0-90-9"; + public const string Norwegian = Latin; + public const string Portuguese = Latin; + public const string Romanian = Latin; + public const string Russian = @"Ѐ-҄҇-ԯᴫᵸⷠ-ⷿꙀ-ꚟ︮︯"; + public const string Swedish = Latin; + public const string Thai = @"[฀-๿]"; + public const string Turkish = Latin; + public const string Vietnamese = @"[A-Za-ẓ̀͐́͑̉̃̓ÂâÊêÔôĂ-ăĐ-đƠ-ơƯ-ư]"; + + private const string Latin = @"A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z"; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/ar/ArabicStopWordFilter.cs b/LunrCore/Globalization/ar/ArabicStopWordFilter.cs new file mode 100644 index 0000000..cbbe705 --- /dev/null +++ b/LunrCore/Globalization/ar/ArabicStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.ar +{ + public sealed class ArabicStopWordFilter : StopWordFilterBase + { + private const string Data = + @"، اض امين اه اها اي ا اب اجل اجمع اخ اخذ اصبح اضحى اقبل اقل اكثر الا ام اما امامك امامك امسى اما ان انا انت انتم انتما انتن انت انشا انى او اوشك اولئك اولئكم اولاء اولالك اوه اي ايا اين اينما اي ان اي اف اذ اذا اذا اذما اذن الى اليكم اليكما اليكن اليك اليك الا اما ان انما اي اياك اياكم اياكما اياكن ايانا اياه اياها اياهم اياهما اياهن اياي ايه ان ا ابتدا اثر اجل احد اخرى اخلولق اذا اربعة ارتد استحال اطار اعادة اعلنت اف اكثر اكد الالاء الالى الا 
الاخيرة الان الاول الاولى التى التي الثاني الثانية الذاتي الذى الذي الذين السابق الف اللائي اللاتي اللتان اللتيا اللتين اللذان اللذين اللواتي الماضي المقبل الوقت الى اليوم اما امام امس ان انبرى انقلب انه انها او اول اي ايار ايام ايضا ب بات باسم بان بخ برس بسبب بس بشكل بضع بطان بعد بعض بك بكم بكما بكن بل بلى بما بماذا بمن بن بنا به بها بي بيد بين بس بله بئس تان تانك تبدل تجاه تحول تلقاء تلك تلكم تلكما تم تينك تين ته تي ثلاثة ثم ثم ثمة ثم جعل جلل جميع جير حار حاشا حاليا حاي حتى حرى حسب حم حوالى حول حيث حيثما حين حي حبذا حتى حذار خلا خلال دون دونك ذا ذات ذاك ذانك ذان ذلك ذلكم ذلكما ذلكن ذو ذوا ذواتا ذواتي ذيت ذينك ذين ذه ذي راح رجع رويدك ريث رب زيارة سبحان سرعان سنة سنوات سوف سوى ساء ساءما شبه شخصا شرع شتان صار صباح صفر صه صه ضد ضمن طاق طالما طفق طق ظل عاد عام عاما عامة عدا عدة عدد عدم عسى عشر عشرة علق على عليك عليه عليها عل عن عند عندما عوض عين عدس عما غدا غير ف فان فلان فو فى في فيم فيما فيه فيها قال قام قبل قد قط قلما قوة كانما كاين كاي كاين كاد كان كانت كذا كذلك كرب كل كلا كلاهما كلتا كلم كليكما كليهما كلما كلا كم كما كي كيت كيف كيفما كان كخ لئن لا لات لاسيما لدن لدى لعمر لقاء لك لكم لكما لكن لكنما لكي لكيلا للامم لم لما لما لن لنا له لها لو لوكالة لولا لوما لي لست لست لستم لستما لستن لست لسن لعل لكن ليت ليس ليسا ليستا ليست ليسوا لسنا ما ماانفك مابرح مادام ماذا مازال مافتئ مايو متى مثل مذ مساء مع معاذ مقابل مكانكم مكانكما مكانكن مكانك مليار مليون مما ممن من منذ منها مه مهما من من نحن نحو نعم نفس نفسه نهاية نخ نعما نعم ها هاؤم هاك هاهنا هب هذا هذه هكذا هل هلم هلا هم هما هن هنا هناك هنالك هو هي هيا هيت هيا هؤلاء هاتان هاتين هاته هاتي هج هذا هذان هذين هذه هذي هيهات و وا واحد واضاف واضافت واكد وان واها واوضح وراءك وفي وقال وقالت وقد وقف وكان وكانت ولا ولم ومن وهو وهي ويكان وي وشكان يكون يمكن يوم ايان"; + + private static readonly ISet WordList = + new Set(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries)); + + protected override ISet StopWords => WordList; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/ar/ArabicTrimmer.cs 
b/LunrCore/Globalization/ar/ArabicTrimmer.cs
using System.Text.RegularExpressions;

namespace Lunr.Globalization.ar
{
	/// <summary>
	/// Trims leading and trailing characters that are not Arabic word characters
	/// from a token (pipeline "trimmer" for Arabic).
	/// </summary>
	public sealed class ArabicTrimmer : TrimmerBase
	{
		private static readonly Regex Pattern = new Regex("(^[^" + WordCharacters.Arabic + "]+|[^" + WordCharacters.Arabic + "]+$)", RegexOptions.Compiled);
		public override string Trim(string s) => Pattern.Replace(s, "");
	}
}
diff --git a/LunrCore/Globalization/da/DanishStopWordFilter.cs b/LunrCore/Globalization/da/DanishStopWordFilter.cs
using System;

namespace Lunr.Globalization.da
{
	/// <summary>
	/// Stop-word filter for Danish. The word list mirrors the lunr-languages
	/// Danish stop-word set.
	/// </summary>
	public sealed class DanishStopWordFilter : StopWordFilterBase
	{
		private const string Data =
			@"ad af alle alt anden at blev blive bliver da de dem den denne der deres det dette dig din disse dog du efter eller en end er et for fra ham han hans har havde have hende hendes her hos hun hvad hvis hvor i ikke ind jeg jer jo kunne man mange med meget men mig min mine mit mod ned noget nogle nu når og også om op os over på selv sig sin sine sit skal skulle som sådan thi til ud under var vi vil ville vor være været";

		// Built once; Set<string> gives O(1) membership tests for the filter.
		private static readonly ISet<string> WordList =
			new Set<string>(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries));

		protected override ISet<string> StopWords => WordList;
	}
}
diff --git a/LunrCore/Globalization/da/DanishTrimmer.cs b/LunrCore/Globalization/da/DanishTrimmer.cs
using System.Text.RegularExpressions;

namespace Lunr.Globalization.da
{
	/// <summary>
	/// Trims leading and trailing non-word characters from a token
	/// (pipeline "trimmer" for Danish).
	/// </summary>
	public sealed class DanishTrimmer : TrimmerBase
	{
		private static readonly Regex Pattern = new Regex("(^[^" +
WordCharacters.Danish + "]+|[^" + WordCharacters.Danish + "]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/de/GermanStemmer.cs b/LunrCore/Globalization/de/GermanStemmer.cs new file mode 100644 index 0000000..6e2a49b --- /dev/null +++ b/LunrCore/Globalization/de/GermanStemmer.cs @@ -0,0 +1,409 @@ +namespace Lunr.Globalization.de +{ + public sealed class GermanStemmer : StemmerBase + { + private readonly SnowballProgram sbp; + private int I_p1; + private int I_p2; + private int I_x; + + public GermanStemmer() + { + sbp = new SnowballProgram(); + I_x = 0; + I_p2 = 0; + I_p1 = 0; + } + + public override string Stem(string w) + { + sbp.SetCurrent(w); + StemImpl(); + return sbp.GetCurrent(); + } + + private bool habr1(string c1, string c2, int v_1) + { + if (sbp.EqualsSegment(1, c1)) + { + sbp.ket = sbp.cursor; + if (sbp.InGrouping(g_v, 97, 252)) + { + sbp.SliceFrom(c2); + sbp.cursor = v_1; + return true; + } + } + + return false; + } + + private void r_prelude() + { + var v_1 = sbp.cursor; + while (true) + { + var v_2 = sbp.cursor; + sbp.bra = v_2; + if (sbp.EqualsSegment(1, "ß")) + { + sbp.ket = sbp.cursor; + sbp.SliceFrom("ss"); + } + else + { + if (v_2 >= sbp.limit) + { + break; + } + + sbp.cursor = v_2 + 1; + } + } + + sbp.cursor = v_1; + while (true) + { + var v_3 = sbp.cursor; + while (true) + { + var v_4 = sbp.cursor; + if (sbp.InGrouping(g_v, 97, 252)) + { + var v_5 = sbp.cursor; + sbp.bra = v_5; + if (habr1("u", "U", v_4)) + { + break; + } + + sbp.cursor = v_5; + if (habr1("y", "Y", v_4)) + { + break; + } + } + + if (v_4 >= sbp.limit) + { + sbp.cursor = v_3; + return; + } + + sbp.cursor = v_4 + 1; + } + } + } + + private bool habr2() + { + while (!sbp.InGrouping(g_v, 97, 252)) + { + if (sbp.cursor >= sbp.limit) + { + return true; + } + + sbp.cursor++; + } + + while (!sbp.OutGrouping(g_v, 97, 252)) + { + if (sbp.cursor >= sbp.limit) + { + 
return true; + } + + sbp.cursor++; + } + + return false; + } + + private void r_mark_regions() + { + I_p1 = sbp.limit; + I_p2 = I_p1; + var c = sbp.cursor + 3; + if (0 <= c && c <= sbp.limit) + { + I_x = c; + if (!habr2()) + { + I_p1 = sbp.cursor; + if (I_p1 < I_x) + { + I_p1 = I_x; + } + + if (!habr2()) + { + I_p2 = sbp.cursor; + } + } + } + } + + private void r_postlude() + { + while (true) + { + var v_1 = sbp.cursor; + sbp.bra = v_1; + var among_var = sbp.FindAmong(a_0, 6); + if (among_var == 0) /* !! */ + { + return; + } + + sbp.ket = sbp.cursor; + switch (among_var) + { + case 1: + sbp.SliceFrom("y"); + break; + case 2: + case 5: + sbp.SliceFrom("u"); + break; + case 3: + sbp.SliceFrom("a"); + break; + case 4: + sbp.SliceFrom("o"); + break; + case 6: + if (sbp.cursor >= sbp.limit) + { + return; + } + + sbp.cursor++; + break; + } + } + } + + private bool r_R1() + { + return I_p1 <= sbp.cursor; + } + + private bool r_R2() + { + return I_p2 <= sbp.cursor; + } + + private void r_standard_suffix() + { + var v_1 = sbp.limit - sbp.cursor; + sbp.ket = sbp.cursor; + var among_var = sbp.FindAmongBackwards(a_1, 7); + if (among_var != 0) /* !! */ + { + sbp.bra = sbp.cursor; + if (r_R1()) + { + switch (among_var) + { + case 1: + sbp.SliceDelete(); + break; + case 2: + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(1, "s")) + { + sbp.bra = sbp.cursor; + if (sbp.EqualsSegmentBackwards(3, "nis")) + { + sbp.SliceDelete(); + } + } + + break; + case 3: + if (sbp.InGroupingBackwards(g_s_ending, 98, 116)) + { + sbp.SliceDelete(); + } + + break; + } + } + } + + sbp.cursor = sbp.limit - v_1; + sbp.ket = sbp.cursor; + among_var = sbp.FindAmongBackwards(a_2, 4); + if (among_var != 0) /* !! 
*/ + { + sbp.bra = sbp.cursor; + if (r_R1()) + { + switch (among_var) + { + case 1: + sbp.SliceDelete(); + break; + case 2: + if (sbp.InGroupingBackwards(g_st_ending, 98, 116)) + { + var c = sbp.cursor - 3; + if (sbp.limit_backward <= c && c <= sbp.limit) + { + sbp.cursor = c; + sbp.SliceDelete(); + } + } + + break; + } + } + } + + sbp.cursor = sbp.limit - v_1; + sbp.ket = sbp.cursor; + among_var = sbp.FindAmongBackwards(a_4, 8); + if (among_var != 0) /* !! */ + { + sbp.bra = sbp.cursor; + if (r_R2()) + { + switch (among_var) + { + case 1: + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(2, "ig")) + { + sbp.bra = sbp.cursor; + var v_2 = sbp.limit - sbp.cursor; + if (!sbp.EqualsSegmentBackwards(1, "e")) + { + sbp.cursor = sbp.limit - v_2; + if (r_R2()) + { + sbp.SliceDelete(); + } + } + } + + break; + case 2: + var v_3 = sbp.limit - sbp.cursor; + if (!sbp.EqualsSegmentBackwards(1, "e")) + { + sbp.cursor = sbp.limit - v_3; + sbp.SliceDelete(); + } + + break; + case 3: + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + var v_4 = sbp.limit - sbp.cursor; + if (!sbp.EqualsSegmentBackwards(2, "er")) + { + sbp.cursor = sbp.limit - v_4; + if (!sbp.EqualsSegmentBackwards(2, "en")) + { + break; + } + } + + sbp.bra = sbp.cursor; + if (r_R1()) + { + sbp.SliceDelete(); + } + + break; + case 4: + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + among_var = sbp.FindAmongBackwards(a_3, 2); + if (among_var != 0) /* !! 
*/ + { + sbp.bra = sbp.cursor; + if (r_R2() && among_var == 1) + { + sbp.SliceDelete(); + } + } + + break; + } + } + } + } + + private void StemImpl() + { + var v_1 = sbp.cursor; + r_prelude(); + sbp.cursor = v_1; + r_mark_regions(); + sbp.limit_backward = v_1; + sbp.cursor = sbp.limit; + r_standard_suffix(); + sbp.cursor = sbp.limit_backward; + r_postlude(); + } + + #region Data + + private static readonly Among[] a_0 = + { + new Among("", -1, 6), + new Among("U", 0, 2), + new Among("Y", 0, 1), + new Among("ä", 0, 3), + new Among("ö", 0, 4), + new Among("ü", 0, 5) + }; + + private static readonly Among[] a_1 = + { + new Among("e", -1, 2), + new Among("em", -1, 1), + new Among("en", -1, 2), + new Among("ern", -1, 1), + new Among("er", -1, 1), + new Among("s", -1, 3), + new Among("es", 5, 2) + }; + + private static readonly Among[] a_2 = + { + new Among("en", -1, 1), + new Among("er", -1, 1), + new Among("st", -1, 2), + new Among("est", 2, 1) + }; + + private static readonly Among[] a_3 = + { + new Among("ig", -1, 1), + new Among("lich", -1, 1) + }; + + private static readonly Among[] a_4 = + { + new Among("end", -1, 1), + new Among("ig", -1, 2), + new Among("ung", -1, 1), + new Among("lich", -1, 3), + new Among("isch", -1, 2), + new Among("ik", -1, 2), + new Among("heit", -1, 3), + new Among("keit", -1, 4) + }; + + private static readonly int[] g_v = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32, 8}; + private static readonly int[] g_s_ending = {117, 30, 5}; + private static readonly int[] g_st_ending = {117, 30, 4}; + + #endregion + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/de/GermanStopWordFilter.cs b/LunrCore/Globalization/de/GermanStopWordFilter.cs new file mode 100644 index 0000000..3d12aa6 --- /dev/null +++ b/LunrCore/Globalization/de/GermanStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.de +{ + public sealed class GermanStopWordFilter : StopWordFilterBase + { + private const string 
Data = + @"aber alle allem allen aller alles als also am an ander andere anderem anderen anderer anderes anderm andern anderr anders auch auf aus bei bin bis bist da damit dann das dasselbe dazu daß dein deine deinem deinen deiner deines dem demselben den denn denselben der derer derselbe derselben des desselben dessen dich die dies diese dieselbe dieselben diesem diesen dieser dieses dir doch dort du durch ein eine einem einen einer eines einig einige einigem einigen einiger einiges einmal er es etwas euch euer eure eurem euren eurer eures für gegen gewesen hab habe haben hat hatte hatten hier hin hinter ich ihm ihn ihnen ihr ihre ihrem ihren ihrer ihres im in indem ins ist jede jedem jeden jeder jedes jene jenem jenen jener jenes jetzt kann kein keine keinem keinen keiner keines können könnte machen man manche manchem manchen mancher manches mein meine meinem meinen meiner meines mich mir mit muss musste nach nicht nichts noch nun nur ob oder ohne sehr sein seine seinem seinen seiner seines selbst sich sie sind so solche solchem solchen solcher solches soll sollte sondern sonst um und uns unse unsem unsen unser unses unter viel vom von vor war waren warst was weg weil weiter welche welchem welchen welcher welches wenn werde werden wie wieder will wir wird wirst wo wollen wollte während würde würden zu zum zur zwar zwischen über"; + + private static readonly ISet WordList = + new Set(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries)); + + protected override ISet StopWords => WordList; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/de/GermanTrimmer.cs b/LunrCore/Globalization/de/GermanTrimmer.cs new file mode 100644 index 0000000..85296fa --- /dev/null +++ b/LunrCore/Globalization/de/GermanTrimmer.cs @@ -0,0 +1,10 @@ +using System.Text.RegularExpressions; + +namespace Lunr.Globalization.de +{ + public sealed class GermanTrimmer : TrimmerBase + { + private static readonly Regex Pattern = new Regex("(^[^" + WordCharacters.German + 
"]+|[^" + WordCharacters.German + "]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/de/Index.cs b/LunrCore/Globalization/de/Index.cs new file mode 100644 index 0000000..82d0f83 --- /dev/null +++ b/LunrCore/Globalization/de/Index.cs @@ -0,0 +1,49 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using Idx = Lunr.Index; + +namespace Lunr.Globalization.de +{ + public static class Index + { + public static async Task Build( + Func? config = null!, + Tokenizer? tokenizer = null!, + PipelineFunctionRegistry? registry = null!, + IEnumerable? indexingPipeline = null!, + IEnumerable? searchPipeline = null!, + params Field[] fields) + { + Pipeline.Function trimmerFunction = new GermanTrimmer().FilterFunction; + Pipeline.Function filterFunction = new GermanStopWordFilter().FilterFunction; + Pipeline.Function stemmerFunction = new GermanStemmer().StemmerFunction; + + registry ??= new PipelineFunctionRegistry(); + registry.Add("trimmer", trimmerFunction); + registry.Add("stopWordFilter", filterFunction); + registry.Add("stemmer", stemmerFunction); + + Pipeline idxPipeline = indexingPipeline is null + ? new Pipeline(registry, trimmerFunction, filterFunction, stemmerFunction) + : new Pipeline(registry, indexingPipeline.Select(function => registry[function]).ToArray()); + Pipeline srchPipeline = searchPipeline is null + ? new Pipeline(registry, stemmerFunction) + : new Pipeline(registry, searchPipeline.Select(function => registry[function]).ToArray()); + + var builder = new Builder( + indexingPipeline: idxPipeline, + searchPipeline: srchPipeline, + tokenizer: tokenizer ?? 
new Tokenizer(), + fields: fields); + + if (config != null) + { + await config(builder); + } + + return builder.Build(); + } + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/es/SpanishStopWordFilter.cs b/LunrCore/Globalization/es/SpanishStopWordFilter.cs new file mode 100644 index 0000000..895d97b --- /dev/null +++ b/LunrCore/Globalization/es/SpanishStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.es +{ + public sealed class SpanishStopWordFilter : StopWordFilterBase + { + private const string Data = + @"a al algo algunas algunos ante antes como con contra cual cuando de del desde donde durante e el ella ellas ellos en entre era erais eran eras eres es esa esas ese eso esos esta estaba estabais estaban estabas estad estada estadas estado estados estamos estando estar estaremos estará estarán estarás estaré estaréis estaría estaríais estaríamos estarían estarías estas este estemos esto estos estoy estuve estuviera estuvierais estuvieran estuvieras estuvieron estuviese estuvieseis estuviesen estuvieses estuvimos estuviste estuvisteis estuviéramos estuviésemos estuvo está estábamos estáis están estás esté estéis estén estés fue fuera fuerais fueran fueras fueron fuese fueseis fuesen fueses fui fuimos fuiste fuisteis fuéramos fuésemos ha habida habidas habido habidos habiendo habremos habrá habrán habrás habré habréis habría habríais habríamos habrían habrías habéis había habíais habíamos habían habías han has hasta hay haya hayamos hayan hayas hayáis he hemos hube hubiera hubierais hubieran hubieras hubieron hubiese hubieseis hubiesen hubieses hubimos hubiste hubisteis hubiéramos hubiésemos hubo la las le les lo los me mi mis mucho muchos muy más mí mía mías mío míos nada ni no nos nosotras nosotros nuestra nuestras nuestro nuestros o os otra otras otro otros para pero poco por porque que quien quienes qué se sea seamos sean seas seremos será serán serás seré seréis sería seríais seríamos serían serías seáis sido 
siendo sin sobre sois somos son soy su sus suya suyas suyo suyos sí también tanto te tendremos tendrá tendrán tendrás tendré tendréis tendría tendríais tendríamos tendrían tendrías tened tenemos tenga tengamos tengan tengas tengo tengáis tenida tenidas tenido tenidos teniendo tenéis tenía teníais teníamos tenían tenías ti tiene tienen tienes todo todos tu tus tuve tuviera tuvierais tuvieran tuvieras tuvieron tuviese tuvieseis tuviesen tuvieses tuvimos tuviste tuvisteis tuviéramos tuviésemos tuvo tuya tuyas tuyo tuyos tú un una uno unos vosotras vosotros vuestra vuestras vuestro vuestros y ya yo él éramos"; + + private static readonly ISet WordList = + new Set(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries)); + + protected override ISet StopWords => WordList; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/es/SpanishTrimmer.cs b/LunrCore/Globalization/es/SpanishTrimmer.cs new file mode 100644 index 0000000..0810ea7 --- /dev/null +++ b/LunrCore/Globalization/es/SpanishTrimmer.cs @@ -0,0 +1,10 @@ +using System.Text.RegularExpressions; + +namespace Lunr.Globalization.es +{ + public sealed class SpanishTrimmer : TrimmerBase + { + private static readonly Regex Pattern = new Regex("(^[^" + WordCharacters.Spanish + "]+|[^" + WordCharacters.Spanish + "]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/fi/FinnishStopWordFilter.cs b/LunrCore/Globalization/fi/FinnishStopWordFilter.cs new file mode 100644 index 0000000..bbdc25b --- /dev/null +++ b/LunrCore/Globalization/fi/FinnishStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.fi +{ + public sealed class FinnishStopWordFilter : StopWordFilterBase + { + private const string Data = + @"ei eivät emme en et ette että he heidän heidät heihin heille heillä heiltä heissä heistä heitä hän häneen hänelle hänellä häneltä hänen hänessä hänestä hänet 
using System;

namespace Lunr.Globalization.fi
{
    /// <summary>
    /// Pipeline filter that removes common Finnish stop words from the token stream.
    /// The word list is the standard lunr-languages Finnish stop-word set.
    /// </summary>
    public sealed class FinnishStopWordFilter : StopWordFilterBase
    {
        private const string Data =
            @"ei eivät emme en et ette että he heidän heidät heihin heille heillä heiltä heissä heistä heitä hän häneen hänelle hänellä häneltä hänen hänessä hänestä hänet häntä itse ja johon joiden joihin joiksi joilla joille joilta joina joissa joista joita joka joksi jolla jolle jolta jona jonka jos jossa josta jota jotka kanssa keiden keihin keiksi keille keillä keiltä keinä keissä keistä keitä keneen keneksi kenelle kenellä keneltä kenen kenenä kenessä kenestä kenet ketkä ketkä ketä koska kuin kuka kun me meidän meidät meihin meille meillä meiltä meissä meistä meitä mihin miksi mikä mille millä miltä minkä minkä minua minulla minulle minulta minun minussa minusta minut minuun minä minä missä mistä mitkä mitä mukaan mutta ne niiden niihin niiksi niille niillä niiltä niin niin niinä niissä niistä niitä noiden noihin noiksi noilla noille noilta noin noina noissa noista noita nuo nyt näiden näihin näiksi näille näillä näiltä näinä näissä näistä näitä nämä ole olemme olen olet olette oli olimme olin olisi olisimme olisin olisit olisitte olisivat olit olitte olivat olla olleet ollut on ovat poikki se sekä sen siihen siinä siitä siksi sille sillä sillä siltä sinua sinulla sinulle sinulta sinun sinussa sinusta sinut sinuun sinä sinä sitä tai te teidän teidät teihin teille teillä teiltä teissä teistä teitä tuo tuohon tuoksi tuolla tuolle tuolta tuon tuona tuossa tuosta tuota tähän täksi tälle tällä tältä tämä tämän tänä tässä tästä tätä vaan vai vaikka yli";

        // Robustness fix: split on any whitespace (null separator) instead of only
        // a single space, so that if the verbatim string is ever re-wrapped across
        // lines no token can end up carrying an embedded newline.
        private static readonly ISet<string> WordList =
            new Set<string>(Data.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries));

        /// <summary>The set of stop words this filter drops.</summary>
        protected override ISet<string> StopWords => WordList;
    }
}
"]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/fr/FrenchStemmer.cs b/LunrCore/Globalization/fr/FrenchStemmer.cs new file mode 100644 index 0000000..116c1fd --- /dev/null +++ b/LunrCore/Globalization/fr/FrenchStemmer.cs @@ -0,0 +1,929 @@ +namespace Lunr.Globalization.fr +{ + public sealed class FrenchStemmer : StemmerBase + { + private readonly int I_p1; + private readonly int I_p2; + private readonly int I_pV; + private readonly SnowballProgram sbp; + + public FrenchStemmer() + { + sbp = new SnowballProgram(); + I_p2 = 0; + I_p1 = 0; + I_pV = 0; + } + + public override string Stem(string w) + { + sbp.SetCurrent(w); + StemImpl(); + return sbp.GetCurrent(); + } + + private bool habr1(string c1, string c2, int v_1) + { + if (sbp.EqualsSegment(1, c1)) + { + sbp.ket = sbp.cursor; + if (sbp.InGrouping(g_v, 97, 251)) + { + sbp.SliceFrom(c2); + sbp.cursor = v_1; + return true; + } + } + + return false; + } + + private bool habr2(string c1, string c2, int v_1) + { + if (sbp.EqualsSegment(1, c1)) + { + sbp.ket = sbp.cursor; + sbp.SliceFrom(c2); + sbp.cursor = v_1; + return true; + } + + return false; + } + + private void r_prelude() + { + while (true) + { + var v_1 = sbp.cursor; + if (sbp.InGrouping(g_v, 97, 251)) + { + sbp.bra = sbp.cursor; + var v_2 = sbp.cursor; + if (habr1("u", "U", v_1)) + { + continue; + } + + sbp.cursor = v_2; + if (habr1("i", "I", v_1)) + { + continue; + } + + sbp.cursor = v_2; + if (habr2("y", "Y", v_1)) + { + continue; + } + } + + sbp.cursor = v_1; + sbp.bra = v_1; + + if (!habr1("y", "Y", v_1)) + { + sbp.cursor = v_1; + if (sbp.EqualsSegment(1, "q")) + { + sbp.bra = sbp.cursor; + if (habr2("u", "U", v_1)) + { + continue; + } + } + + sbp.cursor = v_1; + if (v_1 >= sbp.limit) + { + return; + } + + sbp.cursor++; + } + } + } + + private bool habr3() + { + while (!sbp.InGrouping(g_v, 97, 251)) + { + if (sbp.cursor >= sbp.limit) + { 
+ return true; + } + + sbp.cursor++; + } + + while (!sbp.OutGrouping(g_v, 97, 251)) + { + if (sbp.cursor >= sbp.limit) + { + return true; + } + + sbp.cursor++; + } + + return false; + } + + private void r_mark_regions() + { + var v_1 = sbp.cursor; + var I_pV = sbp.limit; + var I_p1 = I_pV; + var I_p2 = I_pV; + if (sbp.InGrouping(g_v, 97, 251) && sbp.InGrouping(g_v, 97, 251) && + sbp.cursor < sbp.limit) + { + sbp.cursor++; + } + else + { + sbp.cursor = v_1; + if (sbp.FindAmong(a_0, 3) == 0) /* !! */ + { + sbp.cursor = v_1; + do + { + if (sbp.cursor >= sbp.limit) + { + sbp.cursor = I_pV; + break; + } + + sbp.cursor++; + } while (!sbp.InGrouping(g_v, 97, 251)); + } + } + + I_pV = sbp.cursor; + sbp.cursor = v_1; + if (!habr3()) + { + I_p1 = sbp.cursor; + if (!habr3()) + { + I_p2 = sbp.cursor; + } + } + } + + private void r_postlude() + { + while (true) + { + var v_1 = sbp.cursor; + sbp.bra = v_1; + var among_var = sbp.FindAmong(a_1, 4); + if (among_var == 0) /* !! */ + { + break; + } + + sbp.ket = sbp.cursor; + switch (among_var) + { + case 1: + sbp.SliceFrom("i"); + break; + case 2: + sbp.SliceFrom("u"); + break; + case 3: + sbp.SliceFrom("y"); + break; + case 4: + if (sbp.cursor >= sbp.limit) + { + return; + } + + sbp.cursor++; + break; + } + } + } + + private bool r_RV() + { + return I_pV <= sbp.cursor; + } + + private bool r_R1() + { + return I_p1 <= sbp.cursor; + } + + private bool r_R2() + { + return I_p2 <= sbp.cursor; + } + + private bool r_standard_suffix() + { + sbp.ket = sbp.cursor; + var among_var = sbp.FindAmongBackwards(a_4, 43); + if (among_var != 0) /* !! 
*/ + { + sbp.bra = sbp.cursor; + switch (among_var) + { + case 1: + if (!r_R2()) + { + return false; + } + + sbp.SliceDelete(); + break; + case 2: + if (!r_R2()) + { + return false; + } + + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(2, "ic")) + { + sbp.bra = sbp.cursor; + if (!r_R2()) + { + sbp.SliceFrom("iqU"); + } + else + { + sbp.SliceDelete(); + } + } + + break; + case 3: + if (!r_R2()) + { + return false; + } + + sbp.SliceFrom("log"); + break; + case 4: + if (!r_R2()) + { + return false; + } + + sbp.SliceFrom("u"); + break; + case 5: + if (!r_R2()) + { + return false; + } + + sbp.SliceFrom("ent"); + break; + case 6: + if (!r_RV()) + { + return false; + } + + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + among_var = sbp.FindAmongBackwards(a_2, 6); + if (among_var != 0) /* !! */ + { + sbp.bra = sbp.cursor; + switch (among_var) + { + case 1: + if (r_R2()) + { + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(2, "at")) + { + sbp.bra = sbp.cursor; + if (r_R2()) + { + sbp.SliceDelete(); + } + } + } + + break; + case 2: + if (r_R2()) + { + sbp.SliceDelete(); + } + else if (r_R1()) + { + sbp.SliceFrom("eux"); + } + + break; + case 3: + if (r_R2()) + { + sbp.SliceDelete(); + } + + break; + case 4: + if (r_RV()) + { + sbp.SliceFrom("i"); + } + + break; + } + } + + break; + case 7: + if (!r_R2()) + { + return false; + } + + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + among_var = sbp.FindAmongBackwards(a_3, 3); + if (among_var != 0) /* !! 
*/ + { + sbp.bra = sbp.cursor; + switch (among_var) + { + case 1: + if (r_R2()) + { + sbp.SliceDelete(); + } + else + { + sbp.SliceFrom("abl"); + } + + break; + case 2: + if (r_R2()) + { + sbp.SliceDelete(); + } + else + { + sbp.SliceFrom("iqU"); + } + + break; + case 3: + if (r_R2()) + { + sbp.SliceDelete(); + } + + break; + } + } + + break; + case 8: + if (!r_R2()) + { + return false; + } + + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(2, "at")) + { + sbp.bra = sbp.cursor; + if (r_R2()) + { + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(2, "ic")) + { + sbp.bra = sbp.cursor; + if (r_R2()) + { + sbp.SliceDelete(); + } + else + { + sbp.SliceFrom("iqU"); + } + } + } + } + + break; + case 9: + sbp.SliceFrom("eau"); + break; + case 10: + if (!r_R1()) + { + return false; + } + + sbp.SliceFrom("al"); + break; + case 11: + if (r_R2()) + { + sbp.SliceDelete(); + } + else if (!r_R1()) + { + return false; + } + else + { + sbp.SliceFrom("eux"); + } + + break; + case 12: + if (!r_R1() || !sbp.OutGroupingBackwards(g_v, 97, 251)) + { + return false; + } + + sbp.SliceDelete(); + break; + case 13: + if (r_RV()) + { + sbp.SliceFrom("ant"); + } + + return false; + case 14: + if (r_RV()) + { + sbp.SliceFrom("ent"); + } + + return false; + case 15: + var v_1 = sbp.limit - sbp.cursor; + if (sbp.InGroupingBackwards(g_v, 97, 251) && r_RV()) + { + sbp.cursor = sbp.limit - v_1; + sbp.SliceDelete(); + } + + return false; + } + + return true; + } + + return false; + } + + private bool r_i_verb_suffix() + { + if (sbp.cursor < I_pV) + { + return false; + } + + var v_1 = sbp.limit_backward; + sbp.limit_backward = I_pV; + sbp.ket = sbp.cursor; + var among_var = sbp.FindAmongBackwards(a_5, 35); + if (among_var == 0) /* !! 
*/ + { + sbp.limit_backward = v_1; + return false; + } + + sbp.bra = sbp.cursor; + if (among_var == 1) + { + if (!sbp.OutGroupingBackwards(g_v, 97, 251)) + { + sbp.limit_backward = v_1; + return false; + } + + sbp.SliceDelete(); + } + + sbp.limit_backward = v_1; + return true; + } + + private bool r_verb_suffix() + { + if (sbp.cursor < I_pV) + { + return false; + } + + var v_2 = sbp.limit_backward; + sbp.limit_backward = I_pV; + sbp.ket = sbp.cursor; + var among_var = sbp.FindAmongBackwards(a_6, 38); + if (among_var == 0) /* !! */ + { + sbp.limit_backward = v_2; + return false; + } + + sbp.bra = sbp.cursor; + switch (among_var) + { + case 1: + if (!r_R2()) + { + sbp.limit_backward = v_2; + return false; + } + + sbp.SliceDelete(); + break; + case 2: + sbp.SliceDelete(); + break; + case 3: + sbp.SliceDelete(); + var v_3 = sbp.limit - sbp.cursor; + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(1, "e")) + { + sbp.bra = sbp.cursor; + sbp.SliceDelete(); + } + else + { + sbp.cursor = sbp.limit - v_3; + } + + break; + } + + sbp.limit_backward = v_2; + return true; + } + + private void r_residual_suffix() + { + var v_1 = sbp.limit - sbp.cursor; + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(1, "s")) + { + sbp.bra = sbp.cursor; + var v_2 = sbp.limit - sbp.cursor; + if (sbp.OutGroupingBackwards(g_keep_with_s, 97, 232)) + { + sbp.cursor = sbp.limit - v_2; + sbp.SliceDelete(); + } + else + { + sbp.cursor = sbp.limit - v_1; + } + } + else + { + sbp.cursor = sbp.limit - v_1; + } + + if (sbp.cursor >= I_pV) + { + var v_4 = sbp.limit_backward; + sbp.limit_backward = I_pV; + sbp.ket = sbp.cursor; + var among_var = sbp.FindAmongBackwards(a_7, 7); + if (among_var != 0) /* !! 
*/ + { + sbp.bra = sbp.cursor; + switch (among_var) + { + case 1: + if (r_R2()) + { + var v_5 = sbp.limit - sbp.cursor; + if (!sbp.EqualsSegmentBackwards(1, "s")) + { + sbp.cursor = sbp.limit - v_5; + if (!sbp.EqualsSegmentBackwards(1, "t")) + { + break; + } + } + + sbp.SliceDelete(); + } + + break; + case 2: + sbp.SliceFrom("i"); + break; + case 3: + sbp.SliceDelete(); + break; + case 4: + if (sbp.EqualsSegmentBackwards(2, "gu")) + { + sbp.SliceDelete(); + } + + break; + } + } + + sbp.limit_backward = v_4; + } + } + + private void r_un_double() + { + var v_1 = sbp.limit - sbp.cursor; + if (sbp.FindAmongBackwards(a_8, 5) != 0) /* !! */ + { + sbp.cursor = sbp.limit - v_1; + sbp.ket = sbp.cursor; + + if (sbp.cursor > sbp.limit_backward) + { + sbp.cursor--; + sbp.bra = sbp.cursor; + sbp.SliceDelete(); + } + } + } + + private void r_un_accent() + { + var v_2 = 1; + while (sbp.OutGroupingBackwards(g_v, 97, 251)) + { + v_2--; + } + + if (v_2 <= 0) + { + sbp.ket = sbp.cursor; + var v_1 = sbp.limit - sbp.cursor; + if (!sbp.EqualsSegmentBackwards(1, "é")) + { + sbp.cursor = sbp.limit - v_1; + if (!sbp.EqualsSegmentBackwards(1, "è")) + { + return; + } + } + + sbp.bra = sbp.cursor; + sbp.SliceFrom("e"); + } + } + + private void habr5() + { + if (!r_standard_suffix()) + { + sbp.cursor = sbp.limit; + if (!r_i_verb_suffix()) + { + sbp.cursor = sbp.limit; + if (!r_verb_suffix()) + { + sbp.cursor = sbp.limit; + r_residual_suffix(); + return; + } + } + } + + sbp.cursor = sbp.limit; + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(1, "Y")) + { + sbp.bra = sbp.cursor; + sbp.SliceFrom("i"); + } + else + { + sbp.cursor = sbp.limit; + if (sbp.EqualsSegmentBackwards(1, "ç")) + { + sbp.bra = sbp.cursor; + sbp.SliceFrom("c"); + } + } + } + + private void StemImpl() + { + var v_1 = sbp.cursor; + r_prelude(); + sbp.cursor = v_1; + r_mark_regions(); + sbp.limit_backward = v_1; + sbp.cursor = sbp.limit; + habr5(); + sbp.cursor = sbp.limit; + r_un_double(); + sbp.cursor = sbp.limit; + 
r_un_accent(); + sbp.cursor = sbp.limit_backward; + r_postlude(); + } + + #region Data + + private static readonly Among[] a_0 = + { + new Among("col", -1, -1), + new Among("par", -1, -1), + new Among("tap", -1, -1) + }; + + private static readonly Among[] a_1 = + { + new Among("", -1, 4), + new Among("I", 0, 1), + new Among("U", 0, 2), + new Among("Y", 0, 3) + }; + + private static readonly Among[] a_2 = + { + new Among("iqU", -1, 3), + new Among("abl", -1, 3), + new Among("Ièr", -1, 4), + new Among("ièr", -1, 4), + new Among("eus", -1, 2), + new Among("iv", -1, 1) + }; + + private static readonly Among[] a_3 = + { + new Among("ic", -1, 2), + new Among("abil", -1, 1), + new Among("iv", -1, 3) + }; + + private static readonly Among[] a_4 = + { + new Among("iqUe", -1, 1), + new Among("atrice", -1, 2), + new Among("ance", -1, 1), + new Among("ence", -1, 5), + new Among("logie", -1, 3), + new Among("able", -1, 1), + new Among("isme", -1, 1), + new Among("euse", -1, 11), + new Among("iste", -1, 1), + new Among("ive", -1, 8), + new Among("if", -1, 8), + new Among("usion", -1, 4), + new Among("ation", -1, 2), + new Among("ution", -1, 4), + new Among("ateur", -1, 2), + new Among("iqUes", -1, 1), + new Among("atrices", -1, 2), + new Among("ances", -1, 1), + new Among("ences", -1, 5), + new Among("logies", -1, 3), + new Among("ables", -1, 1), + new Among("ismes", -1, 1), + new Among("euses", -1, 11), + new Among("istes", -1, 1), + new Among("ives", -1, 8), + new Among("ifs", -1, 8), + new Among("usions", -1, 4), + new Among("ations", -1, 2), + new Among("utions", -1, 4), + new Among("ateurs", -1, 2), + new Among("ments", -1, 15), + new Among("ements", 30, 6), + new Among("issements", 31, 12), + new Among("ités", -1, 7), + new Among("ment", -1, 15), + new Among("ement", 34, 6), + new Among("issement", 35, 12), + new Among("amment", 34, 13), + new Among("emment", 34, 14), + new Among("aux", -1, 10), + new Among("eaux", 39, 9), + new Among("eux", -1, 1), + new Among("ité", -1, 
7) + }; + + private static readonly Among[] a_5 = + { + new Among("ira", -1, 1), + new Among("ie", -1, 1), + new Among("isse", -1, 1), + new Among("issante", -1, 1), + new Among("i", -1, 1), + new Among("irai", 4, 1), + new Among("ir", -1, 1), + new Among("iras", -1, 1), + new Among("ies", -1, 1), + new Among("îmes", -1, 1), + new Among("isses", -1, 1), + new Among("issantes", -1, 1), + new Among("îtes", -1, 1), + new Among("is", -1, 1), + new Among("irais", 13, 1), + new Among("issais", 13, 1), + new Among("irions", -1, 1), + new Among("issions", -1, 1), + new Among("irons", -1, 1), + new Among("issons", -1, 1), + new Among("issants", -1, 1), + new Among("it", -1, 1), + new Among("irait", 21, 1), + new Among("issait", 21, 1), + new Among("issant", -1, 1), + new Among("iraIent", -1, 1), + new Among("issaIent", -1, 1), + new Among("irent", -1, 1), + new Among("issent", -1, 1), + new Among("iront", -1, 1), + new Among("ît", -1, 1), + new Among("iriez", -1, 1), + new Among("issiez", -1, 1), + new Among("irez", -1, 1), + new Among("issez", -1, 1) + }; + + private static readonly Among[] a_6 = + { + new Among("a", -1, 3), + new Among("era", 0, 2), + new Among("asse", -1, 3), + new Among("ante", -1, 3), + new Among("ée", -1, 2), + new Among("ai", -1, 3), + new Among("erai", 5, 2), + new Among("er", -1, 2), + new Among("as", -1, 3), + new Among("eras", 8, 2), + new Among("âmes", -1, 3), + new Among("asses", -1, 3), + new Among("antes", -1, 3), + new Among("âtes", -1, 3), + new Among("ées", -1, 2), + new Among("ais", -1, 3), + new Among("erais", 15, 2), + new Among("ions", -1, 1), + new Among("erions", 17, 2), + new Among("assions", 17, 3), + new Among("erons", -1, 2), + new Among("ants", -1, 3), + new Among("és", -1, 2), + new Among("ait", -1, 3), + new Among("erait", 23, 2), + new Among("ant", -1, 3), + new Among("aIent", -1, 3), + new Among("eraIent", 26, 2), + new Among("èrent", -1, 2), + new Among("assent", -1, 3), + new Among("eront", -1, 2), + new Among("ât", -1, 
3), + new Among("ez", -1, 2), + new Among("iez", 32, 2), + new Among("eriez", 33, 2), + new Among("assiez", 33, 3), + new Among("erez", 32, 2), + new Among("é", -1, 2) + }; + + private static readonly Among[] a_7 = + { + new Among("e", -1, 3), + new Among("Ière", 0, 2), + new Among("ière", 0, 2), + new Among("ion", -1, 1), + new Among("Ier", -1, 2), + new Among("ier", -1, 2), + new Among("ë", -1, 4) + }; + + private static readonly Among[] a_8 = + { + new Among("ell", -1, -1), + new Among("eill", -1, -1), + new Among("enn", -1, -1), + new Among("onn", -1, -1), + new Among("ett", -1, -1) + }; + + private static readonly int[] g_v = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 130, 103, 8, 5}; + private static readonly int[] g_keep_with_s = {1, 65, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128}; + + #endregion + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/fr/FrenchStopWordFilter.cs b/LunrCore/Globalization/fr/FrenchStopWordFilter.cs new file mode 100644 index 0000000..ee8c021 --- /dev/null +++ b/LunrCore/Globalization/fr/FrenchStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.fr +{ + public sealed class FrenchStopWordFilter : StopWordFilterBase + { + private const string Data = + @"ai aie aient aies ait as au aura aurai auraient aurais aurait auras aurez auriez aurions aurons auront aux avaient avais avait avec avez aviez avions avons ayant ayez ayons c ce ceci celà ces cet cette d dans de des du elle en es est et eu eue eues eurent eus eusse eussent eusses eussiez eussions eut eux eûmes eût eûtes furent fus fusse fussent fusses fussiez fussions fut fûmes fût fûtes ici il ils j je l la le les leur leurs lui m ma mais me mes moi mon même n ne nos notre nous on ont ou par pas pour qu que quel quelle quelles quels qui s sa sans se sera serai seraient serais serait seras serez seriez serions serons seront ses soi soient sois soit sommes son sont soyez soyons suis sur t ta te tes toi ton tu un une vos 
using System.Text.RegularExpressions;

namespace Lunr.Globalization.fr
{
    /// <summary>
    /// Trims leading and trailing non-word characters from tokens,
    /// where "word characters" are defined by the French character set.
    /// </summary>
    public sealed class FrenchTrimmer : TrimmerBase
    {
        // Anchored alternation: a run of non-word characters at the start
        // or at the end of the token. Compiled once, reused for all tokens.
        private static readonly Regex EdgePattern =
            new Regex($"(^[^{WordCharacters.French}]+|[^{WordCharacters.French}]+$)", RegexOptions.Compiled);

        /// <summary>Removes leading/trailing non-word characters from <paramref name="s"/>.</summary>
        public override string Trim(string s)
        {
            return EdgePattern.Replace(s, string.Empty);
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Idx = Lunr.Index;

namespace Lunr.Globalization.fr
{
    /// <summary>
    /// Convenience entry point that builds a French-language index with the
    /// French trimmer, stop-word filter and stemmer pre-registered.
    /// </summary>
    public static class Index
    {
        /// <summary>
        /// Builds an index configured for French.
        /// </summary>
        /// <param name="config">Optional async callback to add documents/fields to the builder.</param>
        /// <param name="tokenizer">Optional tokenizer; defaults to the standard tokenizer.</param>
        /// <param name="registry">Optional function registry; the French functions are added to it.</param>
        /// <param name="indexingPipeline">Optional function names for the indexing pipeline; defaults to trimmer → stop-word filter → stemmer.</param>
        /// <param name="searchPipeline">Optional function names for the search pipeline; defaults to the stemmer alone.</param>
        /// <param name="fields">The document fields to index.</param>
        /// <returns>The built index.</returns>
        public static async Task<Idx> Build(
            // FIX: `= null!` is meaningless on a nullable parameter type and
            // suppresses useful analysis — a plain `null` default is correct.
            Func<Builder, Task>? config = null,
            Tokenizer? tokenizer = null,
            PipelineFunctionRegistry? registry = null,
            IEnumerable<string>? indexingPipeline = null,
            IEnumerable<string>? searchPipeline = null,
            params Field[] fields)
        {
            Pipeline.Function trimmerFunction = new FrenchTrimmer().FilterFunction;
            Pipeline.Function filterFunction = new FrenchStopWordFilter().FilterFunction;
            Pipeline.Function stemmerFunction = new FrenchStemmer().StemmerFunction;

            registry ??= new PipelineFunctionRegistry();
            registry.Add("trimmer", trimmerFunction);
            registry.Add("stopWordFilter", filterFunction);
            registry.Add("stemmer", stemmerFunction);

            Pipeline idxPipeline = indexingPipeline is null
                ? new Pipeline(registry, trimmerFunction, filterFunction, stemmerFunction)
                : new Pipeline(registry, indexingPipeline.Select(function => registry[function]).ToArray());
            Pipeline srchPipeline = searchPipeline is null
                ? new Pipeline(registry, stemmerFunction)
                : new Pipeline(registry, searchPipeline.Select(function => registry[function]).ToArray());

            var builder = new Builder(
                indexingPipeline: idxPipeline,
                searchPipeline: srchPipeline,
                tokenizer: tokenizer ?? new Tokenizer(),
                fields: fields);

            if (config != null)
            {
                await config(builder);
            }

            return builder.Build();
        }
    }
}
using System.Text.RegularExpressions;

namespace Lunr.Globalization.hu
{
    /// <summary>
    /// Trims leading and trailing non-word characters from tokens,
    /// where "word characters" are defined by the Hungarian character set.
    /// </summary>
    public sealed class HungarianTrimmer : TrimmerBase
    {
        // Runs of characters outside the Hungarian word set, anchored to the
        // token's edges; compiled once because it runs on every token.
        private static readonly Regex EdgePattern =
            new Regex($"(^[^{WordCharacters.Hungarian}]+|[^{WordCharacters.Hungarian}]+$)", RegexOptions.Compiled);

        /// <summary>Removes leading/trailing non-word characters from <paramref name="s"/>.</summary>
        public override string Trim(string s)
        {
            return EdgePattern.Replace(s, string.Empty);
        }
    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Idx = Lunr.Index;

namespace Lunr.Globalization.it
{
    /// <summary>
    /// Convenience entry point that builds an Italian-language index with the
    /// Italian trimmer, stop-word filter and stemmer pre-registered.
    /// </summary>
    public static class Index
    {
        /// <summary>
        /// Builds an index configured for Italian.
        /// </summary>
        /// <param name="config">Optional async callback to add documents/fields to the builder.</param>
        /// <param name="tokenizer">Optional tokenizer; defaults to the standard tokenizer.</param>
        /// <param name="registry">Optional function registry; the Italian functions are added to it.</param>
        /// <param name="indexingPipeline">Optional function names for the indexing pipeline; defaults to trimmer → stop-word filter → stemmer.</param>
        /// <param name="searchPipeline">Optional function names for the search pipeline; defaults to the stemmer alone.</param>
        /// <param name="fields">The document fields to index.</param>
        /// <returns>The built index.</returns>
        public static async Task<Idx> Build(
            // FIX: `= null!` is meaningless on a nullable parameter type and
            // suppresses useful analysis — a plain `null` default is correct.
            Func<Builder, Task>? config = null,
            Tokenizer? tokenizer = null,
            PipelineFunctionRegistry? registry = null,
            IEnumerable<string>? indexingPipeline = null,
            IEnumerable<string>? searchPipeline = null,
            params Field[] fields)
        {
            Pipeline.Function trimmerFunction = new ItalianTrimmer().FilterFunction;
            Pipeline.Function filterFunction = new ItalianStopWordFilter().FilterFunction;
            Pipeline.Function stemmerFunction = new ItalianStemmer().StemmerFunction;

            registry ??= new PipelineFunctionRegistry();
            registry.Add("trimmer", trimmerFunction);
            registry.Add("stopWordFilter", filterFunction);
            registry.Add("stemmer", stemmerFunction);

            Pipeline idxPipeline = indexingPipeline is null
                ? new Pipeline(registry, trimmerFunction, filterFunction, stemmerFunction)
                : new Pipeline(registry, indexingPipeline.Select(function => registry[function]).ToArray());
            Pipeline srchPipeline = searchPipeline is null
                ? new Pipeline(registry, stemmerFunction)
                : new Pipeline(registry, searchPipeline.Select(function => registry[function]).ToArray());

            var builder = new Builder(
                indexingPipeline: idxPipeline,
                searchPipeline: srchPipeline,
                tokenizer: tokenizer ?? new Tokenizer(),
                fields: fields);

            if (config != null)
            {
                await config(builder);
            }

            return builder.Build();
        }
    }
}
new Tokenizer(), + fields: fields); + + if (config != null) + { + await config(builder); + } + + return builder.Build(); + } + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/it/ItalianStemmer.cs b/LunrCore/Globalization/it/ItalianStemmer.cs new file mode 100644 index 0000000..2c5acc6 --- /dev/null +++ b/LunrCore/Globalization/it/ItalianStemmer.cs @@ -0,0 +1,804 @@ +namespace Lunr.Globalization.it +{ + public sealed class ItalianStemmer : StemmerBase + { + private int I_p1; + private int I_p2; + private int I_pV; + private readonly SnowballProgram sbp; + + public ItalianStemmer() + { + sbp = new SnowballProgram(); + I_p2 = 0; + I_p1 = 0; + I_pV = 0; + } + + public override string Stem(string w) + { + sbp.SetCurrent(w); + StemImpl(); + return sbp.GetCurrent(); + } + + private bool habr1(string c1, string c2, int v_1) + { + if (sbp.EqualsSegment(1, c1)) + { + sbp.ket = sbp.cursor; + if (sbp.InGrouping(g_v, 97, 249)) + { + sbp.SliceFrom(c2); + sbp.cursor = v_1; + return true; + } + } + + return false; + } + + private void r_prelude() + { + var v_1 = sbp.cursor; + while (true) + { + sbp.bra = sbp.cursor; + var among_var = sbp.FindAmong(a_0, 7); + if (among_var != 0) /* !! 
*/ + { + sbp.ket = sbp.cursor; + switch (among_var) + { + case 1: + sbp.SliceFrom(@"à"); + continue; + case 2: + sbp.SliceFrom(@"è"); + continue; + case 3: + sbp.SliceFrom(@"ì"); + continue; + case 4: + sbp.SliceFrom(@"ò"); + continue; + case 5: + sbp.SliceFrom(@"ù"); + continue; + case 6: + sbp.SliceFrom("qU"); + continue; + case 7: + if (sbp.cursor >= sbp.limit) + { + break; + } + + sbp.cursor++; + continue; + } + } + + break; + } + + sbp.cursor = v_1; + while (true) + { + var v_2 = sbp.cursor; + while (true) + { + var v_3 = sbp.cursor; + if (sbp.InGrouping(g_v, 97, 249)) + { + sbp.bra = sbp.cursor; + var v_4 = sbp.cursor; + if (habr1("u", "U", v_3)) + { + break; + } + + sbp.cursor = v_4; + if (habr1("i", "I", v_3)) + { + break; + } + } + + sbp.cursor = v_3; + if (sbp.cursor >= sbp.limit) + { + sbp.cursor = v_2; + return; + } + + sbp.cursor++; + } + } + } + + private bool habr2(int v_1) + { + sbp.cursor = v_1; + if (!sbp.InGrouping(g_v, 97, 249)) + { + return false; + } + + while (!sbp.OutGrouping(g_v, 97, 249)) + { + if (sbp.cursor >= sbp.limit) + { + return false; + } + + sbp.cursor++; + } + + return true; + } + + private bool habr3() + { + if (sbp.InGrouping(g_v, 97, 249)) + { + var v_1 = sbp.cursor; + if (sbp.OutGrouping(g_v, 97, 249)) + { + while (!sbp.InGrouping(g_v, 97, 249)) + { + if (sbp.cursor >= sbp.limit) + { + return habr2(v_1); + } + + sbp.cursor++; + } + + return true; + } + + return habr2(v_1); + } + + return false; + } + + private void habr4() + { + var v_1 = sbp.cursor; + if (!habr3()) + { + sbp.cursor = v_1; + if (!sbp.OutGrouping(g_v, 97, 249)) + { + return; + } + + var v_2 = sbp.cursor; + if (sbp.OutGrouping(g_v, 97, 249)) + { + while (!sbp.InGrouping(g_v, 97, 249)) + { + if (sbp.cursor >= sbp.limit) + { + sbp.cursor = v_2; + if (sbp.InGrouping(g_v, 97, 249) && + sbp.cursor < sbp.limit) + { + sbp.cursor++; + } + + return; + } + + sbp.cursor++; + } + + I_pV = sbp.cursor; + return; + } + + sbp.cursor = v_2; + if (!sbp.InGrouping(g_v, 97, 249) 
|| sbp.cursor >= sbp.limit) + { + return; + } + + sbp.cursor++; + } + + I_pV = sbp.cursor; + } + + private bool habr5() + { + while (!sbp.InGrouping(g_v, 97, 249)) + { + if (sbp.cursor >= sbp.limit) + { + return false; + } + + sbp.cursor++; + } + + while (!sbp.OutGrouping(g_v, 97, 249)) + { + if (sbp.cursor >= sbp.limit) + { + return false; + } + + sbp.cursor++; + } + + return true; + } + + private void r_mark_regions() + { + var v_1 = sbp.cursor; + I_pV = sbp.limit; + I_p1 = I_pV; + I_p2 = I_pV; + habr4(); + sbp.cursor = v_1; + if (habr5()) + { + I_p1 = sbp.cursor; + if (habr5()) + { + I_p2 = sbp.cursor; + } + } + } + + private void r_postlude() + { + while (true) + { + sbp.bra = sbp.cursor; + var among_var = sbp.FindAmong(a_1, 3); + if (among_var == 0) /* !! */ + { + break; + } + + sbp.ket = sbp.cursor; + switch (among_var) + { + case 1: + sbp.SliceFrom("i"); + break; + case 2: + sbp.SliceFrom("u"); + break; + case 3: + if (sbp.cursor >= sbp.limit) + { + return; + } + + sbp.cursor++; + break; + } + } + } + + private bool r_RV() + { + return I_pV <= sbp.cursor; + } + + private bool r_R1() + { + return I_p1 <= sbp.cursor; + } + + private bool r_R2() + { + return I_p2 <= sbp.cursor; + } + + private void r_attached_pronoun() + { + sbp.ket = sbp.cursor; + if (sbp.FindAmongBackwards(a_2, 37) != 0) /* !! */ + { + sbp.bra = sbp.cursor; + var among_var = sbp.FindAmongBackwards(a_3, 5); + if (among_var != 0 /* !! */ && r_RV()) + { + switch (among_var) + { + case 1: + sbp.SliceDelete(); + break; + case 2: + sbp.SliceFrom("e"); + break; + } + } + } + } + + private bool r_standard_suffix() + { + sbp.ket = sbp.cursor; + var among_var = sbp.FindAmongBackwards(a_6, 51); + if (among_var == 0) /* !! 
*/ + { + return false; + } + + sbp.bra = sbp.cursor; + switch (among_var) + { + case 1: + if (!r_R2()) + { + return false; + } + + sbp.SliceDelete(); + break; + case 2: + if (!r_R2()) + { + return false; + } + + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(2, "ic")) + { + sbp.bra = sbp.cursor; + if (r_R2()) + { + sbp.SliceDelete(); + } + } + + break; + case 3: + if (!r_R2()) + { + return false; + } + + sbp.SliceFrom("log"); + break; + case 4: + if (!r_R2()) + { + return false; + } + + sbp.SliceFrom("u"); + break; + case 5: + if (!r_R2()) + { + return false; + } + + sbp.SliceFrom("ente"); + break; + case 6: + if (!r_RV()) + { + return false; + } + + sbp.SliceDelete(); + break; + case 7: + if (!r_R1()) + { + return false; + } + + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + among_var = sbp.FindAmongBackwards(a_4, 4); + if (among_var != 0) /* !! */ + { + sbp.bra = sbp.cursor; + if (r_R2()) + { + sbp.SliceDelete(); + if (among_var == 1) + { + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(2, "at")) + { + sbp.bra = sbp.cursor; + if (r_R2()) + { + sbp.SliceDelete(); + } + } + } + } + } + + break; + case 8: + if (!r_R2()) + { + return false; + } + + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + among_var = sbp.FindAmongBackwards(a_5, 3); + if (among_var != 0) /* !! 
*/ + { + sbp.bra = sbp.cursor; + if (among_var == 1) + { + if (r_R2()) + { + sbp.SliceDelete(); + } + } + } + + break; + case 9: + if (!r_R2()) + { + return false; + } + + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(2, "at")) + { + sbp.bra = sbp.cursor; + if (r_R2()) + { + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(2, "ic")) + { + sbp.bra = sbp.cursor; + if (r_R2()) + { + sbp.SliceDelete(); + } + } + } + } + + break; + } + + return true; + } + + private void r_verb_suffix() + { + if (sbp.cursor >= I_pV) + { + var v_1 = sbp.limit_backward; + sbp.limit_backward = I_pV; + sbp.ket = sbp.cursor; + var among_var = sbp.FindAmongBackwards(a_7, 87); + if (among_var != 0) /* !! */ + { + sbp.bra = sbp.cursor; + if (among_var == 1) + { + sbp.SliceDelete(); + } + } + + sbp.limit_backward = v_1; + } + } + + private void habr6() + { + var v_1 = sbp.limit - sbp.cursor; + sbp.ket = sbp.cursor; + if (sbp.InGroupingBackwards(g_AEIO, 97, 242)) + { + sbp.bra = sbp.cursor; + if (r_RV()) + { + sbp.SliceDelete(); + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(1, "i")) + { + sbp.bra = sbp.cursor; + if (r_RV()) + { + sbp.SliceDelete(); + return; + } + } + } + } + + sbp.cursor = sbp.limit - v_1; + } + + private void r_vowel_suffix() + { + habr6(); + sbp.ket = sbp.cursor; + if (sbp.EqualsSegmentBackwards(1, "h")) + { + sbp.bra = sbp.cursor; + if (sbp.InGroupingBackwards(g_CG, 99, 103)) + { + if (r_RV()) + { + sbp.SliceDelete(); + } + } + } + } + + private void StemImpl() + { + var v_1 = sbp.cursor; + r_prelude(); + sbp.cursor = v_1; + r_mark_regions(); + sbp.limit_backward = v_1; + sbp.cursor = sbp.limit; + r_attached_pronoun(); + sbp.cursor = sbp.limit; + if (!r_standard_suffix()) + { + sbp.cursor = sbp.limit; + r_verb_suffix(); + } + + sbp.cursor = sbp.limit; + r_vowel_suffix(); + sbp.cursor = sbp.limit_backward; + r_postlude(); + } + + #region Data + + private static readonly Among[] a_0 = + { + new Among("", -1, 
7), + new Among("qu", 0, 6), + new Among(@"á", 0, 1), + new Among(@"é", 0, 2), + new Among(@"í", 0, 3), + new Among(@"ó", 0, 4), + new Among(@"ú", 0, 5) + }; + + private static readonly Among[] a_1 = + { + new Among("", -1, 3), + new Among("I", 0, 1), + new Among("U", 0, 2) + }; + + private static readonly Among[] a_2 = + { + new Among("la", -1, -1), + new Among("cela", 0, -1), + new Among("gliela", 0, -1), + new Among("mela", 0, -1), + new Among("tela", 0, -1), + new Among("vela", 0, -1), + new Among("le", -1, -1), + new Among("cele", 6, -1), + new Among("gliele", 6, -1), + new Among("mele", 6, -1), + new Among("tele", 6, -1), + new Among("vele", 6, -1), + new Among("ne", -1, -1), + new Among("cene", 12, -1), + new Among("gliene", 12, -1), + new Among("mene", 12, -1), + new Among("sene", 12, -1), + new Among("tene", 12, -1), + new Among("vene", 12, -1), + new Among("ci", -1, -1), + new Among("li", -1, -1), + new Among("celi", 20, -1), + new Among("glieli", 20, -1), + new Among("meli", 20, -1), + new Among("teli", 20, -1), + new Among("veli", 20, -1), + new Among("gli", 20, -1), + new Among("mi", -1, -1), + new Among("si", -1, -1), + new Among("ti", -1, -1), + new Among("vi", -1, -1), + new Among("lo", -1, -1), + new Among("celo", 31, -1), + new Among("glielo", 31, -1), + new Among("melo", 31, -1), + new Among("telo", 31, -1), + new Among("velo", 31, -1) + }; + + private static readonly Among[] a_3 = + { + new Among("ando", -1, 1), + new Among("endo", -1, 1), + new Among("ar", -1, 2), + new Among("er", -1, 2), + new Among("ir", -1, 2) + }; + + private static readonly Among[] a_4 = + { + new Among("ic", -1, -1), + new Among("abil", -1, -1), + new Among("os", -1, -1), + new Among("iv", -1, 1) + }; + + private static readonly Among[] a_5 = + { + new Among("ic", -1, 1), + new Among("abil", -1, 1), + new Among("iv", -1, 1) + }; + + private static readonly Among[] a_6 = + { + new Among("ica", -1, 1), + new Among("logia", -1, 3), + new Among("osa", -1, 1), + new 
Among("ista", -1, 1), + new Among("iva", -1, 9), + new Among("anza", -1, 1), + new Among("enza", -1, 5), + new Among("ice", -1, 1), + new Among("atrice", 7, 1), + new Among("iche", -1, 1), + new Among("logie", -1, 3), + new Among("abile", -1, 1), + new Among("ibile", -1, 1), + new Among("usione", -1, 4), + new Among("azione", -1, 2), + new Among("uzione", -1, 4), + new Among("atore", -1, 2), + new Among("ose", -1, 1), + new Among("ante", -1, 1), + new Among("mente", -1, 1), + new Among("amente", 19, 7), + new Among("iste", -1, 1), + new Among("ive", -1, 9), + new Among("anze", -1, 1), + new Among("enze", -1, 5), + new Among("ici", -1, 1), + new Among("atrici", 25, 1), + new Among("ichi", -1, 1), + new Among("abili", -1, 1), + new Among("ibili", -1, 1), + new Among("ismi", -1, 1), + new Among("usioni", -1, 4), + new Among("azioni", -1, 2), + new Among("uzioni", -1, 4), + new Among("atori", -1, 2), + new Among("osi", -1, 1), + new Among("anti", -1, 1), + new Among("amenti", -1, 6), + new Among("imenti", -1, 6), + new Among("isti", -1, 1), + new Among("ivi", -1, 9), + new Among("ico", -1, 1), + new Among("ismo", -1, 1), + new Among("oso", -1, 1), + new Among("amento", -1, 6), + new Among("imento", -1, 6), + new Among("ivo", -1, 9), + new Among(@"ità", -1, 8), + new Among(@"istà", -1, 1), + new Among(@"istè", -1, 1), + new Among(@"istì", -1, 1) + }; + + private static readonly Among[] a_7 = + { + new Among("isca", -1, 1), + new Among("enda", -1, 1), + new Among("ata", -1, 1), + new Among("ita", -1, 1), + new Among("uta", -1, 1), + new Among("ava", -1, 1), + new Among("eva", -1, 1), + new Among("iva", -1, 1), + new Among("erebbe", -1, 1), + new Among("irebbe", -1, 1), + new Among("isce", -1, 1), + new Among("ende", -1, 1), + new Among("are", -1, 1), + new Among("ere", -1, 1), + new Among("ire", -1, 1), + new Among("asse", -1, 1), + new Among("ate", -1, 1), + new Among("avate", 16, 1), + new Among("evate", 16, 1), + new Among("ivate", 16, 1), + new Among("ete", -1, 1), + 
new Among("erete", 20, 1), + new Among("irete", 20, 1), + new Among("ite", -1, 1), + new Among("ereste", -1, 1), + new Among("ireste", -1, 1), + new Among("ute", -1, 1), + new Among("erai", -1, 1), + new Among("irai", -1, 1), + new Among("isci", -1, 1), + new Among("endi", -1, 1), + new Among("erei", -1, 1), + new Among("irei", -1, 1), + new Among("assi", -1, 1), + new Among("ati", -1, 1), + new Among("iti", -1, 1), + new Among("eresti", -1, 1), + new Among("iresti", -1, 1), + new Among("uti", -1, 1), + new Among("avi", -1, 1), + new Among("evi", -1, 1), + new Among("ivi", -1, 1), + new Among("isco", -1, 1), + new Among("ando", -1, 1), + new Among("endo", -1, 1), + new Among("Yamo", -1, 1), + new Among("iamo", -1, 1), + new Among("avamo", -1, 1), + new Among("evamo", -1, 1), + new Among("ivamo", -1, 1), + new Among("eremo", -1, 1), + new Among("iremo", -1, 1), + new Among("assimo", -1, 1), + new Among("ammo", -1, 1), + new Among("emmo", -1, 1), + new Among("eremmo", 54, 1), + new Among("iremmo", 54, 1), + new Among("immo", -1, 1), + new Among("ano", -1, 1), + new Among("iscano", 58, 1), + new Among("avano", 58, 1), + new Among("evano", 58, 1), + new Among("ivano", 58, 1), + new Among("eranno", -1, 1), + new Among("iranno", -1, 1), + new Among("ono", -1, 1), + new Among("iscono", 65, 1), + new Among("arono", 65, 1), + new Among("erono", 65, 1), + new Among("irono", 65, 1), + new Among("erebbero", -1, 1), + new Among("irebbero", -1, 1), + new Among("assero", -1, 1), + new Among("essero", -1, 1), + new Among("issero", -1, 1), + new Among("ato", -1, 1), + new Among("ito", -1, 1), + new Among("uto", -1, 1), + new Among("avo", -1, 1), + new Among("evo", -1, 1), + new Among("ivo", -1, 1), + new Among("ar", -1, 1), + new Among("ir", -1, 1), + new Among(@"erà", -1, 1), + new Among(@"irà", -1, 1), + new Among(@"erò", -1, 1), + new Among(@"irò", -1, 1) + }; + + private static readonly int[] g_v = {17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128, 8, 2, 1}; + private 
static readonly int[] g_AEIO = {17, 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128, 8, 2}; + private static readonly int[] g_CG = {17}; + + #endregion + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/it/ItalianStopWordFilter.cs b/LunrCore/Globalization/it/ItalianStopWordFilter.cs new file mode 100644 index 0000000..9c1fe23 --- /dev/null +++ b/LunrCore/Globalization/it/ItalianStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.it +{ + public sealed class ItalianStopWordFilter : StopWordFilterBase + { + private const string Data = + @"a abbia abbiamo abbiano abbiate ad agl agli ai al all alla alle allo anche avemmo avendo avesse avessero avessi avessimo aveste avesti avete aveva avevamo avevano avevate avevi avevo avrai avranno avrebbe avrebbero avrei avremmo avremo avreste avresti avrete avrà avrò avuta avute avuti avuto c che chi ci coi col come con contro cui da dagl dagli dai dal dall dalla dalle dallo degl degli dei del dell della delle dello di dov dove e ebbe ebbero ebbi ed era erano eravamo eravate eri ero essendo faccia facciamo facciano facciate faccio facemmo facendo facesse facessero facessi facessimo faceste facesti faceva facevamo facevano facevate facevi facevo fai fanno farai faranno farebbe farebbero farei faremmo faremo fareste faresti farete farà farò fece fecero feci fosse fossero fossi fossimo foste fosti fu fui fummo furono gli ha hai hanno ho i il in io l la le lei li lo loro lui ma mi mia mie miei mio ne negl negli nei nel nell nella nelle nello noi non nostra nostre nostri nostro o per perché più quale quanta quante quanti quanto quella quelle quelli quello questa queste questi questo sarai saranno sarebbe sarebbero sarei saremmo saremo sareste saresti sarete sarà sarò se sei si sia siamo siano siate siete sono sta stai stando stanno starai staranno starebbe starebbero starei staremmo staremo stareste staresti starete starà starò stava stavamo stavano stavate stavi stavo stemmo stesse 
stessero stessi stessimo steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua sue sugl sugli sui sul sull sulla sulle sullo suo suoi ti tra tu tua tue tuo tuoi tutti tutto un una uno vi voi vostra vostre vostri vostro è"; + + private static readonly ISet WordList = + new Set(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries)); + + protected override ISet StopWords => WordList; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/it/ItalianTrimmer.cs b/LunrCore/Globalization/it/ItalianTrimmer.cs new file mode 100644 index 0000000..c90bf23 --- /dev/null +++ b/LunrCore/Globalization/it/ItalianTrimmer.cs @@ -0,0 +1,10 @@ +using System.Text.RegularExpressions; + +namespace Lunr.Globalization.it +{ + public sealed class ItalianTrimmer : TrimmerBase + { + private static readonly Regex Pattern = new Regex("(^[^" + WordCharacters.Italian + "]+|[^" + WordCharacters.Italian + "]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/jp/JapaneseStopWordFilter.cs b/LunrCore/Globalization/jp/JapaneseStopWordFilter.cs new file mode 100644 index 0000000..4c6364e --- /dev/null +++ b/LunrCore/Globalization/jp/JapaneseStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.jp +{ + public sealed class JapaneseStopWordFilter : StopWordFilterBase + { + private const string Data = + @"これ それ あれ この その あの ここ そこ あそこ こちら どこ だれ なに なん 何 私 貴方 貴方方 我々 私達 あの人 あのかた 彼女 彼 です あります おります います は が の に を で え から まで より も どの と し それで しかし"; + + private static readonly ISet WordList = + new Set(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries)); + + protected override ISet StopWords => WordList; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/jp/JapaneseTrimmer.cs b/LunrCore/Globalization/jp/JapaneseTrimmer.cs new file mode 100644 index 0000000..6db319f --- /dev/null +++ 
b/LunrCore/Globalization/jp/JapaneseTrimmer.cs @@ -0,0 +1,10 @@ +using System.Text.RegularExpressions; + +namespace Lunr.Globalization.jp +{ + public sealed class JapaneseTrimmer : TrimmerBase + { + private static readonly Regex Pattern = new Regex("(^[^" + WordCharacters.Japanese + "]+|[^" + WordCharacters.Japanese + "]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/nl/DutchStopWordFilter.cs b/LunrCore/Globalization/nl/DutchStopWordFilter.cs new file mode 100644 index 0000000..63c5fc8 --- /dev/null +++ b/LunrCore/Globalization/nl/DutchStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.nl +{ + public sealed class DutchStopWordFilter : StopWordFilterBase + { + private const string Data = + @" aan al alles als altijd andere ben bij daar dan dat de der deze die dit doch doen door dus een eens en er ge geen geweest haar had heb hebben heeft hem het hier hij hoe hun iemand iets ik in is ja je kan kon kunnen maar me meer men met mij mijn moet na naar niet niets nog nu of om omdat onder ons ook op over reeds te tegen toch toen tot u uit uw van veel voor want waren was wat werd wezen wie wil worden wordt zal ze zelf zich zij zijn zo zonder zou"; + + private static readonly ISet WordList = + new Set(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries)); + + protected override ISet StopWords => WordList; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/nl/DutchTrimmer.cs b/LunrCore/Globalization/nl/DutchTrimmer.cs new file mode 100644 index 0000000..9785bd2 --- /dev/null +++ b/LunrCore/Globalization/nl/DutchTrimmer.cs @@ -0,0 +1,10 @@ +using System.Text.RegularExpressions; + +namespace Lunr.Globalization.nl +{ + public sealed class DutchTrimmer : TrimmerBase + { + private static readonly Regex Pattern = new Regex("(^[^" + WordCharacters.Dutch + "]+|[^" + WordCharacters.Dutch + "]+$)", 
RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/no/NorwegianStopWordFilter.cs b/LunrCore/Globalization/no/NorwegianStopWordFilter.cs new file mode 100644 index 0000000..030519c --- /dev/null +++ b/LunrCore/Globalization/no/NorwegianStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.no +{ + public sealed class NorwegianStopWordFilter : StopWordFilterBase + { + private const string Data = + @"alle at av bare begge ble blei bli blir blitt både båe da de deg dei deim deira deires dem den denne der dere deres det dette di din disse ditt du dykk dykkar då eg ein eit eitt eller elles en enn er et ett etter for fordi fra før ha hadde han hans har hennar henne hennes her hjå ho hoe honom hoss hossen hun hva hvem hver hvilke hvilken hvis hvor hvordan hvorfor i ikke ikkje ikkje ingen ingi inkje inn inni ja jeg kan kom korleis korso kun kunne kva kvar kvarhelst kven kvi kvifor man mange me med medan meg meget mellom men mi min mine mitt mot mykje ned no noe noen noka noko nokon nokor nokre nå når og også om opp oss over på samme seg selv si si sia sidan siden sin sine sitt sjøl skal skulle slik so som som somme somt så sånn til um upp ut uten var vart varte ved vere verte vi vil ville vore vors vort vår være være vært å"; + + private static readonly ISet WordList = + new Set(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries)); + + protected override ISet StopWords => WordList; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/no/NorwegianTrimmer.cs b/LunrCore/Globalization/no/NorwegianTrimmer.cs new file mode 100644 index 0000000..442cd30 --- /dev/null +++ b/LunrCore/Globalization/no/NorwegianTrimmer.cs @@ -0,0 +1,10 @@ +using System.Text.RegularExpressions; + +namespace Lunr.Globalization.no +{ + public sealed class NorwegianTrimmer : TrimmerBase + { + private static readonly Regex Pattern = new 
Regex("(^[^" + WordCharacters.Norwegian + "]+|[^" + WordCharacters.Norwegian + "]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/pt/PortugueseStopWordFilter.cs b/LunrCore/Globalization/pt/PortugueseStopWordFilter.cs new file mode 100644 index 0000000..0fa0d2c --- /dev/null +++ b/LunrCore/Globalization/pt/PortugueseStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.pt +{ + public sealed class PortugueseStopWordFilter : StopWordFilterBase + { + private const string Data = + @"a ao aos aquela aquelas aquele aqueles aquilo as até com como da das de dela delas dele deles depois do dos e ela elas ele eles em entre era eram essa essas esse esses esta estamos estas estava estavam este esteja estejam estejamos estes esteve estive estivemos estiver estivera estiveram estiverem estivermos estivesse estivessem estivéramos estivéssemos estou está estávamos estão eu foi fomos for fora foram forem formos fosse fossem fui fôramos fôssemos haja hajam hajamos havemos hei houve houvemos houver houvera houveram houverei houverem houveremos houveria houveriam houvermos houverá houverão houveríamos houvesse houvessem houvéramos houvéssemos há hão isso isto já lhe lhes mais mas me mesmo meu meus minha minhas muito na nas nem no nos nossa nossas nosso nossos num numa não nós o os ou para pela pelas pelo pelos por qual quando que quem se seja sejam sejamos sem serei seremos seria seriam será serão seríamos seu seus somos sou sua suas são só também te tem temos tenha tenham tenhamos tenho terei teremos teria teriam terá terão teríamos teu teus teve tinha tinham tive tivemos tiver tivera tiveram tiverem tivermos tivesse tivessem tivéramos tivéssemos tu tua tuas tém tínhamos um uma você vocês vos à às éramos"; + + private static readonly ISet WordList = + new Set(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries)); + + protected 
override ISet StopWords => WordList; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/pt/PortugueseTrimmer.cs b/LunrCore/Globalization/pt/PortugueseTrimmer.cs new file mode 100644 index 0000000..6a1e976 --- /dev/null +++ b/LunrCore/Globalization/pt/PortugueseTrimmer.cs @@ -0,0 +1,10 @@ +using System.Text.RegularExpressions; + +namespace Lunr.Globalization.pt +{ + public sealed class PortugueseTrimmer : TrimmerBase + { + private static readonly Regex Pattern = new Regex("(^[^" + WordCharacters.Portuguese + "]+|[^" + WordCharacters.Portuguese + "]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/ro/RomanianStopWordFilter.cs b/LunrCore/Globalization/ro/RomanianStopWordFilter.cs new file mode 100644 index 0000000..881c3b9 --- /dev/null +++ b/LunrCore/Globalization/ro/RomanianStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.ro +{ + public sealed class RomanianStopWordFilter : StopWordFilterBase + { + private const string Data = + @"acea aceasta această aceea acei aceia acel acela acele acelea acest acesta aceste acestea aceşti aceştia acolo acord acum ai aia aibă aici al ale alea altceva altcineva am ar are asemenea asta astea astăzi asupra au avea avem aveţi azi aş aşadar aţi bine bucur bună ca care caut ce cel ceva chiar cinci cine cineva contra cu cum cumva curând curînd când cât câte câtva câţi cînd cît cîte cîtva cîţi că căci cărei căror cărui către da dacă dar datorită dată dau de deci deja deoarece departe deşi din dinaintea dintr- dintre doi doilea două drept după dă ea ei el ele eram este eu eşti face fata fi fie fiecare fii fim fiu fiţi frumos fără graţie halbă iar ieri la le li lor lui lângă lîngă mai mea mei mele mereu meu mi mie mine mult multă mulţi mulţumesc mâine mîine mă ne nevoie nici nicăieri nimeni nimeri nimic nişte noastre noastră noi noroc nostru nouă noştri nu opt ori oricare 
orice oricine oricum oricând oricât oricînd oricît oriunde patra patru patrulea pe pentru peste pic poate pot prea prima primul prin puţin puţina puţină până pînă rog sa sale sau se spate spre sub sunt suntem sunteţi sută sînt sîntem sînteţi să săi său ta tale te timp tine toate toată tot totuşi toţi trei treia treilea tu tăi tău un una unde undeva unei uneia unele uneori unii unor unora unu unui unuia unul vi voastre voastră voi vostru vouă voştri vreme vreo vreun vă zece zero zi zice îi îl îmi împotriva în înainte înaintea încotro încât încît între întrucât întrucît îţi ăla ălea ăsta ăstea ăştia şapte şase şi ştiu ţi ţie"; + + private static readonly ISet WordList = + new Set(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries)); + + protected override ISet StopWords => WordList; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/ro/RomanianTrimmer.cs b/LunrCore/Globalization/ro/RomanianTrimmer.cs new file mode 100644 index 0000000..6cb6d37 --- /dev/null +++ b/LunrCore/Globalization/ro/RomanianTrimmer.cs @@ -0,0 +1,10 @@ +using System.Text.RegularExpressions; + +namespace Lunr.Globalization.ro +{ + public sealed class RomanianTrimmer : TrimmerBase + { + private static readonly Regex Pattern = new Regex("(^[^" + WordCharacters.Romanian + "]+|[^" + WordCharacters.Romanian + "]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/ru/RussianStopWordFilter.cs b/LunrCore/Globalization/ru/RussianStopWordFilter.cs new file mode 100644 index 0000000..9295ba9 --- /dev/null +++ b/LunrCore/Globalization/ru/RussianStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.ru +{ + public sealed class RussianStopWordFilter : StopWordFilterBase + { + private const string Data = + @"алло без близко более больше будем будет будете будешь будто буду будут будь бы бывает бывь был была были было быть в важная важное 
важные важный вам вами вас ваш ваша ваше ваши вверх вдали вдруг ведь везде весь вниз внизу во вокруг вон восемнадцатый восемнадцать восемь восьмой вот впрочем времени время все всегда всего всем всеми всему всех всею всю всюду вся всё второй вы г где говорил говорит год года году да давно даже далеко дальше даром два двадцатый двадцать две двенадцатый двенадцать двух девятнадцатый девятнадцать девятый девять действительно дел день десятый десять для до довольно долго должно другая другие других друго другое другой е его ее ей ему если есть еще ещё ею её ж же жизнь за занят занята занято заняты затем зато зачем здесь значит и из или им именно иметь ими имя иногда их к каждая каждое каждые каждый кажется как какая какой кем когда кого ком кому конечно которая которого которой которые который которых кроме кругом кто куда лет ли лишь лучше люди м мало между меля менее меньше меня миллионов мимо мира мне много многочисленная многочисленное многочисленные многочисленный мной мною мог могут мож может можно можхо мои мой мор мочь моя моё мы на наверху над надо назад наиболее наконец нам нами нас начала наш наша наше наши не него недавно недалеко нее ней нельзя нем немного нему непрерывно нередко несколько нет нею неё ни нибудь ниже низко никогда никуда ними них ничего но ну нужно нх о об оба обычно один одиннадцатый одиннадцать однажды однако одного одной около он она они оно опять особенно от отовсюду отсюда очень первый перед по под пожалуйста позже пока пор пора после посреди потом потому почему почти прекрасно при про просто против процентов пятнадцатый пятнадцать пятый пять раз разве рано раньше рядом с сам сама сами самим самими самих само самого самой самом самому саму свое своего своей свои своих свою сеаой себе себя сегодня седьмой сейчас семнадцатый семнадцать семь сих сказал сказала сказать сколько слишком сначала снова со собой собою совсем спасибо стал суть т та так такая также такие такое такой там твой твоя твоё те тебе тебя тем теми теперь тех то тобой 
тобою тогда того тоже только том тому тот тою третий три тринадцатый тринадцать ту туда тут ты тысяч у уж уже уметь хорошо хотеть хоть хотя хочешь часто чаще чего человек чем чему через четвертый четыре четырнадцатый четырнадцать что чтоб чтобы чуть шестнадцатый шестнадцать шестой шесть эта эти этим этими этих это этого этой этом этому этот эту я а"; + + private static readonly ISet WordList = + new Set(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries)); + + protected override ISet StopWords => WordList; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/ru/RussianTrimmer.cs b/LunrCore/Globalization/ru/RussianTrimmer.cs new file mode 100644 index 0000000..cd9b134 --- /dev/null +++ b/LunrCore/Globalization/ru/RussianTrimmer.cs @@ -0,0 +1,10 @@ +using System.Text.RegularExpressions; + +namespace Lunr.Globalization.ru +{ + public sealed class RussianTrimmer : TrimmerBase + { + private static readonly Regex Pattern = new Regex("(^[^" + WordCharacters.Russian + "]+|[^" + WordCharacters.Russian + "]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/sv/SwedishStopWordFilter.cs b/LunrCore/Globalization/sv/SwedishStopWordFilter.cs new file mode 100644 index 0000000..9f5f6e9 --- /dev/null +++ b/LunrCore/Globalization/sv/SwedishStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.sv +{ + public sealed class SwedishStopWordFilter : StopWordFilterBase + { + private const string Data = + @"alla allt att av blev bli blir blivit de dem den denna deras dess dessa det detta dig din dina ditt du där då efter ej eller en er era ert ett från för ha hade han hans har henne hennes hon honom hur här i icke ingen inom inte jag ju kan kunde man med mellan men mig min mina mitt mot mycket ni nu när någon något några och om oss på samma sedan sig sin sina sitta själv skulle som så sådan sådana sådant till under upp 
ut utan vad var vara varför varit varje vars vart vem vi vid vilka vilkas vilken vilket vår våra vårt än är åt över"; + + private static readonly ISet WordList = + new Set(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries)); + + protected override ISet StopWords => WordList; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/sv/SwedishTrimmer.cs b/LunrCore/Globalization/sv/SwedishTrimmer.cs new file mode 100644 index 0000000..b98c73b --- /dev/null +++ b/LunrCore/Globalization/sv/SwedishTrimmer.cs @@ -0,0 +1,10 @@ +using System.Text.RegularExpressions; + +namespace Lunr.Globalization.sv +{ + public sealed class SwedishTrimmer : TrimmerBase + { + private static readonly Regex Pattern = new Regex("(^[^" + WordCharacters.Swedish + "]+|[^" + WordCharacters.Swedish + "]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/th/ThaiTrimmer.cs b/LunrCore/Globalization/th/ThaiTrimmer.cs new file mode 100644 index 0000000..252aa3e --- /dev/null +++ b/LunrCore/Globalization/th/ThaiTrimmer.cs @@ -0,0 +1,10 @@ +using System.Text.RegularExpressions; + +namespace Lunr.Globalization.th +{ + public sealed class ThaiTrimmer : TrimmerBase + { + private static readonly Regex Pattern = new Regex("(^[^" + WordCharacters.Thai + "]+|[^" + WordCharacters.Thai + "]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/tr/TurkishStopWordFilter.cs b/LunrCore/Globalization/tr/TurkishStopWordFilter.cs new file mode 100644 index 0000000..fc4572d --- /dev/null +++ b/LunrCore/Globalization/tr/TurkishStopWordFilter.cs @@ -0,0 +1,15 @@ +using System; + +namespace Lunr.Globalization.tr +{ + public sealed class TurkishStopWordFilter : StopWordFilterBase + { + private const string Data = + @"acaba altmış altı ama ancak arada aslında ayrıca bana bazı 
belki ben benden beni benim beri beş bile bin bir biri birkaç birkez birçok birşey birşeyi biz bizden bize bizi bizim bu buna bunda bundan bunlar bunları bunların bunu bunun burada böyle böylece da daha dahi de defa değil diye diğer doksan dokuz dolayı dolayısıyla dört edecek eden ederek edilecek ediliyor edilmesi ediyor elli en etmesi etti ettiği ettiğini eğer gibi göre halen hangi hatta hem henüz hep hepsi her herhangi herkesin hiç hiçbir iki ile ilgili ise itibaren itibariyle için işte kadar karşın katrilyon kendi kendilerine kendini kendisi kendisine kendisini kez ki kim kimden kime kimi kimse kırk milyar milyon mu mü mı nasıl ne neden nedenle nerde nerede nereye niye niçin o olan olarak oldu olduklarını olduğu olduğunu olmadı olmadığı olmak olması olmayan olmaz olsa olsun olup olur olursa oluyor on ona ondan onlar onlardan onları onların onu onun otuz oysa pek rağmen sadece sanki sekiz seksen sen senden seni senin siz sizden sizi sizin tarafından trilyon tüm var vardı ve veya ya yani yapacak yapmak yaptı yaptıkları yaptığı yaptığını yapılan yapılması yapıyor yedi yerine yetmiş yine yirmi yoksa yüz zaten çok çünkü öyle üzere üç şey şeyden şeyi şeyler şu şuna şunda şundan şunları şunu şöyle"; + + private static readonly ISet WordList = + new Set(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries)); + + protected override ISet StopWords => WordList; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/tr/TurkishTrimmer.cs b/LunrCore/Globalization/tr/TurkishTrimmer.cs new file mode 100644 index 0000000..7f7f9c3 --- /dev/null +++ b/LunrCore/Globalization/tr/TurkishTrimmer.cs @@ -0,0 +1,10 @@ +using System.Text.RegularExpressions; + +namespace Lunr.Globalization.tr +{ + public sealed class TurkishTrimmer : TrimmerBase + { + private static readonly Regex Pattern = new Regex("(^[^" + WordCharacters.Turkish + "]+|[^" + WordCharacters.Turkish + "]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); 
+ } +} \ No newline at end of file diff --git a/LunrCore/Globalization/vi/VietnameseStopWordFilter.cs b/LunrCore/Globalization/vi/VietnameseStopWordFilter.cs new file mode 100644 index 0000000..b414da2 --- /dev/null +++ b/LunrCore/Globalization/vi/VietnameseStopWordFilter.cs @@ -0,0 +1,14 @@ +using System; + +namespace Lunr.Globalization.vi +{ + public sealed class VietnameseStopWordFilter : StopWordFilterBase + { + private const string Data = @"là cái nhưng mà"; + + private static readonly ISet WordList = + new Set(Data.Split(new[] {" "}, StringSplitOptions.RemoveEmptyEntries)); + + protected override ISet StopWords => WordList; + } +} \ No newline at end of file diff --git a/LunrCore/Globalization/vi/VietnameseTrimmer.cs b/LunrCore/Globalization/vi/VietnameseTrimmer.cs new file mode 100644 index 0000000..7bbe7e7 --- /dev/null +++ b/LunrCore/Globalization/vi/VietnameseTrimmer.cs @@ -0,0 +1,10 @@ +using System.Text.RegularExpressions; + +namespace Lunr.Globalization.vi +{ + public sealed class VietnameseTrimmer : TrimmerBase + { + private static readonly Regex Pattern = new Regex("(^[^" + WordCharacters.Vietnamese + "]+|[^" + WordCharacters.Vietnamese + "]+$)", RegexOptions.Compiled); + public override string Trim(string s) => Pattern.Replace(s, ""); + } +} \ No newline at end of file diff --git a/LunrCore/LunrCore.csproj b/LunrCore/LunrCore.csproj index 730e486..05519c0 100644 --- a/LunrCore/LunrCore.csproj +++ b/LunrCore/LunrCore.csproj @@ -24,12 +24,12 @@ - + - - + + diff --git a/LunrCoreLmdb/Globalization/fr/LmdbIndex.cs b/LunrCoreLmdb/Globalization/fr/LmdbIndex.cs new file mode 100644 index 0000000..8ff8d36 --- /dev/null +++ b/LunrCoreLmdb/Globalization/fr/LmdbIndex.cs @@ -0,0 +1,89 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Lunr; +using Lunr.Globalization.fr; + +namespace LunrCoreLmdb.Globalization.fr +{ + public static class LmdbIndex + { + /// + /// A convenience 
function for configuring and constructing + /// a new lunr DelegatedIndex. + /// + /// An `LmdbBuilder` instance is created and the pipeline set up + /// with a trimmer, stop word filter and stemmer. + /// + /// This builder object is yielded to the configuration function + /// that is passed as a parameter, allowing the list of fields + /// and other builder parameters to be customized. + /// + /// All documents _must_ be added within the passed config function. + /// + /// + /// var idx = Index.Build(async builder => + /// { + /// builder + /// .AddField("title") + /// .AddField("body"); + /// + /// builder.ReferenceField = "id"; + /// + /// foreach(Document doc in documents) + /// { + /// builder.add(doc); + /// } + /// }); + /// + /// The directory path to the LMDB database used to store this index. + /// A configuration function. + /// An optional tokenizer. Default is a ToString() based splitter. + /// An optional pipeline function registry. Default filters through the specific trimmer, stopword filter, and stemmer. + /// An optional indexing pipeline. Default filters through the specific trimmer, stopword filter, and stemmer. + /// An optional search pipeline. Default filters through the stemmer. + /// An optional cancellation token. Default is equivalent to CancellationToken.None. + /// The fields for this builder. + /// The delegated index. + public static async Task Build( + string path, + Func? config = null!, + Tokenizer? tokenizer = null!, + PipelineFunctionRegistry? registry = null!, + IEnumerable? indexingPipeline = null!, + IEnumerable? 
searchPipeline = null!, + CancellationToken cancellationToken = default, + params Field[] fields) + { + Pipeline.Function trimmerFunction = new FrenchTrimmer().FilterFunction; + Pipeline.Function filterFunction = new FrenchStopWordFilter().FilterFunction; + Pipeline.Function stemmerFunction = new FrenchStemmer().StemmerFunction; + registry ??= new PipelineFunctionRegistry(); + registry.Add("trimmer", trimmerFunction); + registry.Add("stopWordFilter", filterFunction); + registry.Add("stemmer", stemmerFunction); + + Pipeline idxPipeline = indexingPipeline is null + ? new Pipeline(registry, trimmerFunction, filterFunction, stemmerFunction) + : new Pipeline(registry, indexingPipeline.Select(function => registry[function]).ToArray()); + Pipeline srchPipeline = searchPipeline is null + ? new Pipeline(registry, stemmerFunction) + : new Pipeline(registry, searchPipeline.Select(function => registry[function]).ToArray()); + + var builder = new LmdbBuilder( + indexingPipeline: idxPipeline, + searchPipeline: srchPipeline, + tokenizer: tokenizer ?? new Tokenizer(), + fields: fields); + + if (config != null) + { + await config(builder); + } + + return builder.Build(path, cancellationToken); + } + } +} diff --git a/LunrCoreLmdb/Globalization/it/LmdbIndex.cs b/LunrCoreLmdb/Globalization/it/LmdbIndex.cs new file mode 100644 index 0000000..9c204d2 --- /dev/null +++ b/LunrCoreLmdb/Globalization/it/LmdbIndex.cs @@ -0,0 +1,92 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Lunr; +using Lunr.Globalization.it; + +namespace LunrCoreLmdb.Globalization.it +{ + public static class LmdbIndex + { + /// + /// A convenience function for configuring and constructing + /// a new lunr DelegatedIndex. + /// + /// An `LmdbBuilder` instance is created and the pipeline setup + /// with a trimmer, stop word filter and stemmer. 
+ /// + /// This builder object is yielded to the configuration function + /// that is passed as a parameter, allowing the list of fields + /// and other builder parameters to be customized. + /// + /// All documents _must_ be added within the passed config function. + /// + /// + /// var idx = Index.Build(async builder => + /// { + /// builder + /// .AddField("title") + /// .AddField("body"); + /// + /// builder.ReferenceField = "id"; + /// + /// foreach(Document doc in documents) + /// { + /// builder.add(doc); + /// } + /// }); + /// + /// The directory path to the LMDB database used to store this index. + /// The default pipeline is built from the Italian trimmer, + /// stop word filter, and stemmer, registered under their + /// standard names in the pipeline function registry. + /// A configuration function. + /// An optional tokenizer. Default is a ToString() based splitter. + /// An optional pipeline function registry. Default filters through the specific trimmer, stopword filter, and stemmer. + /// An optional indexing pipeline. Default filters through the specific trimmer, stopword filter, and stemmer. + /// An optional search pipeline. Default filters through the stemmer. + /// An optional cancellation token. Default is equivalent to CancellationToken.None. + /// The fields for this builder. + /// The delegated index. + public static async Task Build( + string path, + Func? config = null!, + Tokenizer? tokenizer = null!, + PipelineFunctionRegistry? registry = null!, + IEnumerable? indexingPipeline = null!, + IEnumerable? 
searchPipeline = null!, + CancellationToken cancellationToken = default, + params Field[] fields) + { + Pipeline.Function trimmerFunction = new ItalianTrimmer().FilterFunction; + Pipeline.Function filterFunction = new ItalianStopWordFilter().FilterFunction; + Pipeline.Function stemmerFunction = new ItalianStemmer().StemmerFunction; + registry ??= new PipelineFunctionRegistry(); + registry.Add("trimmer", trimmerFunction); + registry.Add("stopWordFilter", filterFunction); + registry.Add("stemmer", stemmerFunction); + + Pipeline idxPipeline = indexingPipeline is null + ? new Pipeline(registry, trimmerFunction, filterFunction, stemmerFunction) + : new Pipeline(registry, indexingPipeline.Select(function => registry[function]).ToArray()); + Pipeline srchPipeline = searchPipeline is null + ? new Pipeline(registry, stemmerFunction) + : new Pipeline(registry, searchPipeline.Select(function => registry[function]).ToArray()); + + var builder = new LmdbBuilder( + indexingPipeline: idxPipeline, + searchPipeline: srchPipeline, + tokenizer: tokenizer ?? 
new Tokenizer(), + fields: fields); + + if (config != null) + { + await config(builder); + } + + return builder.Build(path, cancellationToken); + } + } +} diff --git a/LunrCoreLmdb/LmdbIndex.cs b/LunrCoreLmdb/LmdbIndex.cs index 51ed5e5..bb91aea 100644 --- a/LunrCoreLmdb/LmdbIndex.cs +++ b/LunrCoreLmdb/LmdbIndex.cs @@ -107,7 +107,7 @@ public static async Task Build( return builder.Build(path, cancellationToken); } - + #region Fields public bool AddField(string field, CancellationToken cancellationToken = default) => WithWritableTransaction((db, tx) => diff --git a/LunrCoreLmdbPerf/BlockCopyVsLinqConcat.cs b/LunrCoreLmdbPerf/BlockCopyVsLinqConcat.cs index 4736a03..6120ed2 100644 --- a/LunrCoreLmdbPerf/BlockCopyVsLinqConcat.cs +++ b/LunrCoreLmdbPerf/BlockCopyVsLinqConcat.cs @@ -9,8 +9,8 @@ namespace LunrCoreLmdbPerf [SimpleJob(RunStrategy.Throughput)] public class BlockCopyVsLinqConcat { - private byte[] _left; - private byte[] _right; + private byte[] _left = null!; + private byte[] _right = null!; [GlobalSetup] public void GlobalSetUp() diff --git a/LunrCoreLmdbTests/Globalization/fr/FrenchSearchTests.cs b/LunrCoreLmdbTests/Globalization/fr/FrenchSearchTests.cs new file mode 100644 index 0000000..7b5c977 --- /dev/null +++ b/LunrCoreLmdbTests/Globalization/fr/FrenchSearchTests.cs @@ -0,0 +1,71 @@ +using System; +using System.Collections.Generic; +using System.Threading.Tasks; +using Lunr; +using LunrCoreLmdb; +using Xunit; + +namespace LunrCoreLmdbTests.Globalization.fr +{ + [Collection(nameof(TempDirectory))] + public class FrenchSearchTests : IDisposable + { + private readonly TempDirectory _tempDir; + + public FrenchSearchTests(TempDirectory tempDir) + { + _tempDir = tempDir; + } + + private readonly Document[] _documents = { + new Document + { + { "id", "1" }, + { "title", "France" }, + { "body", "La France Prononciation du titre dans sa version originale Écouter, officiellement la République française Prononciation du titre dans sa version originale Écouter, est 
un État transcontinental souverain, dont le territoire métropolitain est situé en Europe de l'Ouest. Ce dernier a des frontières terrestres avec la Belgique, le Luxembourg, l'Allemagne, la Suisse, l'Italie, l'Espagne et les principautés d'Andorre et de MonacoN 6,6 et dispose d'importantes façades maritimes dans l'Atlantique, la Manche, la mer du Nord et la Méditerranée. Son territoire ultramarin s'étend dans les océans Indien7, Atlantique8 et Pacifique9 ainsi que sur le continent sud-américain10 et a des frontières terrestres avec le Brésil, le Suriname et le Royaume des Pays-Bas." } + }, + new Document + { + { "id", "2" }, + { "title", "Politique et administration" }, + { "body", "La France est une démocratie libérale, dont le gouvernement a la forme d’une république. Les fondements de l’organisation politique et administrative actuelle de la France ont été fixés en 1958 par la Constitution de la Cinquième République. Selon l’article premier de cette constitution, « la France est une République indivisible, laïque, démocratique et sociale ». 
Depuis 2003, ce même article affirme en outre que « son organisation est décentralisée continuelle" } + }, + }; + + [Theory] + [InlineData("France", 2)] + [InlineData("gouvernement", 1)] + [InlineData("continuellement", 1)] + [InlineData("inexistent", 0)] + public async Task FindTheWord(string word, int resultCount) + { + using var idx = await GetPlainIndex(); + IList results = await idx.Search(word).ToList(); + Assert.Equal(resultCount, results.Count); + } + + private async Task GetPlainIndex() + { + var idx = await Lunr.Globalization.fr.Index.Build(async builder => + { + builder.ReferenceField = "id"; + + builder + .AddField("title") + .AddField("body", 10); + + foreach (Document doc in _documents) + { + await builder.Add(doc); + } + }); + + return idx.CopyToLmdb(_tempDir.NewDirectory()); + } + + public void Dispose() + { + _tempDir.Dispose(); + } + } +} diff --git a/LunrCoreLmdbTests/Globalization/it/ItalianSearchTests.cs b/LunrCoreLmdbTests/Globalization/it/ItalianSearchTests.cs new file mode 100644 index 0000000..6133761 --- /dev/null +++ b/LunrCoreLmdbTests/Globalization/it/ItalianSearchTests.cs @@ -0,0 +1,72 @@ +using System; +using System.Collections.Generic; +using System.Threading.Tasks; +using Lunr; +using LunrCoreLmdb; +using Xunit; + +namespace LunrCoreLmdbTests.Globalization.it +{ + [Collection(nameof(TempDirectory))] + public class ItalianSearchTests : IDisposable + { + private readonly TempDirectory _tempDir; + + public ItalianSearchTests(TempDirectory tempDir) + { + _tempDir = tempDir; + } + + private readonly Document[] _documents = + { + new Document + { + { "id", "a" }, + { "title", "Italia" }, + { "body", "L'Italia (/iˈtalja/[9], ascolta[?·info]), ufficialmente Repubblica Italiana,[10] è una repubblica parlamentare situata nell'Europa meridionale, con una popolazione di 60,6 milioni di abitanti e Roma come capitale. 
Delimitata dall'arco alpino, confina a nord, da ovest a est, con Francia, Svizzera, Austria e Slovenia; il resto del territorio, circondato dai mari Ligure, Tirreno, Ionio e Adriatico, si protende nel mar Mediterraneo, occupando la penisola italiana e numerose isole (le maggiori sono Sicilia e Sardegna), per un totale di 301 340 km²[11]. Gli Stati della Città del Vaticano e di San Marino sono enclavi della Repubblica." } + }, + new Document + { + { "id", "b" }, + { "title", "Suddivisioni amministrative" }, + { "body", "Gli enti territoriali che, in base all'articolo 114 della Costituzione costituiscono, assieme allo Stato, la Repubblica italiana sono: le regioni (15 a statuto ordinario e 5 a statuto speciale); le città metropolitane (14); le province e i comuni (rispettivamente 93 e 7 999, dati ISTAT dell'anno 2016).[121] Nell'elenco che segue, per ciascuna regione è riportato lo stemma ufficiale e il nome del capoluogo. pronunziato" } + }, + }; + + [Theory] + [InlineData("Italia*", 2)] + [InlineData("assieme", 1)] + [InlineData("pronunziarle", 1)] + [InlineData("inexistent", 0)] + public async Task FindTheWord(string word, int resultCount) + { + using var idx = await GetPlainIndex(); + IList results = await idx.Search(word).ToList(); + Assert.Equal(resultCount, results.Count); + } + + private async Task GetPlainIndex() + { + var idx = await Lunr.Globalization.it.Index.Build(async builder => + { + builder.ReferenceField = "id"; + + builder + .AddField("title") + .AddField("body", 10); + + foreach (Document doc in _documents) + { + await builder.Add(doc); + } + }); + + return idx.CopyToLmdb(_tempDir.NewDirectory()); + } + + public void Dispose() + { + _tempDir.Dispose(); + } + } +} \ No newline at end of file diff --git a/LunrCoreLmdbTests/LmdbIndexExtensions.cs b/LunrCoreLmdbTests/LmdbIndexExtensions.cs new file mode 100644 index 0000000..3de54c7 --- /dev/null +++ b/LunrCoreLmdbTests/LmdbIndexExtensions.cs @@ -0,0 +1,27 @@ +using Lunr; +using LunrCoreLmdb; +using 
Xunit; + +namespace LunrCoreLmdbTests +{ + public static class IndexExtensions + { + public static DelegatedIndex CopyToLmdb(this Index index, string path) + { + var lmdb = new LmdbIndex(path); + + foreach (var field in index.Fields) + Assert.True(lmdb.AddField(field)); + + foreach (var (k, v) in index.FieldVectors) + Assert.True(lmdb.AddFieldVector(k, v)); + + foreach (var (k, v) in index.InvertedIndex) + Assert.True(lmdb.AddInvertedIndexEntry(k, v)); + + var idx = new DelegatedIndex(lmdb, index.Pipeline); + + return idx; + } + } +} \ No newline at end of file diff --git a/LunrCoreLmdbTests/SearchTests.cs b/LunrCoreLmdbTests/SearchTests.cs index e8fa7bd..b28313c 100644 --- a/LunrCoreLmdbTests/SearchTests.cs +++ b/LunrCoreLmdbTests/SearchTests.cs @@ -535,7 +535,7 @@ private async Task GetPlainIndex() } }); - return CopyIndex(idx); + return idx.CopyToLmdb(_tempDir.NewDirectory()); } private async Task GetIndexWithDocumentBoost() @@ -559,29 +559,9 @@ private async Task GetIndexWithDocumentBoost() } }); - return CopyIndex(idx); + return idx.CopyToLmdb(_tempDir.NewDirectory()); } - private DelegatedIndex CopyIndex(Lunr.Index index) - { - var path = _tempDir.NewDirectory(); - - var lmdb = new LmdbIndex(path); - - foreach (var field in index.Fields) - Assert.True(lmdb.AddField(field)); - - foreach (var (k, v) in index.FieldVectors) - Assert.True(lmdb.AddFieldVector(k, v)); - - foreach (var (k, v) in index.InvertedIndex) - Assert.True(lmdb.AddInvertedIndexEntry(k, v)); - - var idx = new DelegatedIndex(lmdb, index.Pipeline); - - return idx; - } - public void Dispose() { _tempDir.Dispose(); diff --git a/LunrCoreTests/Globalization/AmongTests.cs b/LunrCoreTests/Globalization/AmongTests.cs new file mode 100644 index 0000000..f7c4ac4 --- /dev/null +++ b/LunrCoreTests/Globalization/AmongTests.cs @@ -0,0 +1,29 @@ +using System; +using Lunr.Globalization; +using Xunit; + +namespace LunrCoreTests.Globalization +{ + public class AmongTests + { + [Theory] + [InlineData("col", -1, 
-1)] + [InlineData("a", -1, 3)] + [InlineData("eriez", 33, 2)] + [InlineData("", -1, 4)] + public void Can_construct_valid_data(string s, int substring_i, int result) + { + var one = new Among(s, substring_i, result); + var two = new Among(s, substring_i, result); + + Assert.True(one.Equals(two)); + } + + [Theory] + [InlineData(default, -1, -1)] + public void Throws_on_invalid_data(string s, int substring_i, int result) + { + Assert.Throws(() => new Among(s, substring_i, result)); + } + } +} diff --git a/LunrCoreTests/Globalization/de/GermanSearchTests.cs b/LunrCoreTests/Globalization/de/GermanSearchTests.cs new file mode 100644 index 0000000..4f3c586 --- /dev/null +++ b/LunrCoreTests/Globalization/de/GermanSearchTests.cs @@ -0,0 +1,56 @@ +using System.Collections.Generic; +using System.Threading.Tasks; +using Lunr; +using Xunit; +using Index = Lunr.Index; + +namespace LunrCoreTests.Globalization.de +{ + public class GermanSearchTests + { + private readonly Document[] _documents = new[] + { + new Document + { + { "id", "1" }, + { "title", "Deutschland" }, + { "body", "An Deutschland grenzen neun Nachbarländer und naturräumlich im Norden die Gewässer der Nord- und Ostsee, im Süden das Bergland der Alpen. Es liegt in der gemäßigten Klimazone, zählt mit rund 80 Millionen Einwohnern zu den dicht besiedelten Flächenstaaten und gilt international als das Land mit der dritthöchsten Zahl von Einwanderern. aufeinanderfolgenden. auffassen." } + }, + new Document + { + { "id", "2" }, + { "title", "Tourismus in Deutschland" }, + { "body", "Deutschland als Urlaubsziel verfügt über günstige Voraussetzungen: Gebirgslandschaften (Alpen und Mittelgebirge), See- und Flusslandschaften, die Küsten und Inseln der Nord- und Ostsee, zahlreiche Kulturdenkmäler und eine Vielzahl geschichtsträchtiger Städte sowie gut ausgebaute Infrastruktur. Vorteilhaft ist die zentrale Lage in Europa." 
} + }, + }; + + [Theory(Skip = "There is a bug in this stemmer")] + [InlineData("Deutsch*", 2)] + [InlineData("Urlaubsziel*", 1)] + [InlineData("auffassung", 1)] + [InlineData("inexistent", 0)] + public async Task FindTheWord(string word, int resultCount) + { + Index idx = await GetPlainIndex(); + IList results = await idx.Search(word).ToList(); + Assert.Equal(resultCount, results.Count); + } + + private async Task GetPlainIndex() + { + return await Lunr.Globalization.de.Index.Build(async builder => + { + builder.ReferenceField = "id"; + + builder + .AddField("title") + .AddField("body", boost: 10); + + foreach (Document doc in _documents) + { + await builder.Add(doc); + } + }); + } + } +} diff --git a/LunrCoreTests/Globalization/de/GermanStemmerTests.cs b/LunrCoreTests/Globalization/de/GermanStemmerTests.cs new file mode 100644 index 0000000..1df64ab --- /dev/null +++ b/LunrCoreTests/Globalization/de/GermanStemmerTests.cs @@ -0,0 +1,16 @@ +using Lunr.Globalization.de; +using Xunit; + +namespace LunrCoreTests.Globalization.de +{ + public class GermanStemmerTests + { + [Theory(Skip = "There is a bug in this stemmer")] + [InlineData("auffassen", "auffass")] + [InlineData("auffassung", "auffass")] // habr1 converts this to auffassUng, which then sieves out + public void Stems_standard_suffixes(string word, string stemmed) + { + Assert.Equal(stemmed, new GermanStemmer().Stem(word)); + } + } +} \ No newline at end of file diff --git a/LunrCoreTests/Globalization/fr/FrenchSearchTests.cs b/LunrCoreTests/Globalization/fr/FrenchSearchTests.cs new file mode 100644 index 0000000..cd66867 --- /dev/null +++ b/LunrCoreTests/Globalization/fr/FrenchSearchTests.cs @@ -0,0 +1,56 @@ +using System.Collections.Generic; +using System.Threading.Tasks; +using Lunr; +using Xunit; +using Index = Lunr.Index; + +namespace LunrCoreTests.Globalization.fr +{ + public class FrenchSearchTests + { + private readonly Document[] _documents = new[] + { + new Document + { + { "id", "1" }, + { "title", 
"France" }, + { "body", "La France Prononciation du titre dans sa version originale Écouter, officiellement la République française Prononciation du titre dans sa version originale Écouter, est un État transcontinental souverain, dont le territoire métropolitain est situé en Europe de l'Ouest. Ce dernier a des frontières terrestres avec la Belgique, le Luxembourg, l'Allemagne, la Suisse, l'Italie, l'Espagne et les principautés d'Andorre et de MonacoN 6,6 et dispose d'importantes façades maritimes dans l'Atlantique, la Manche, la mer du Nord et la Méditerranée. Son territoire ultramarin s'étend dans les océans Indien7, Atlantique8 et Pacifique9 ainsi que sur le continent sud-américain10 et a des frontières terrestres avec le Brésil, le Suriname et le Royaume des Pays-Bas." } + }, + new Document + { + { "id", "2" }, + { "title", "Politique et administration" }, + { "body", "La France est une démocratie libérale, dont le gouvernement a la forme d’une république. Les fondements de l’organisation politique et administrative actuelle de la France ont été fixés en 1958 par la Constitution de la Cinquième République. Selon l’article premier de cette constitution, « la France est une République indivisible, laïque, démocratique et sociale ». 
Depuis 2003, ce même article affirme en outre que « son organisation est décentralisée continuelle" } + }, + }; + + [Theory] + [InlineData("France", 2)] + [InlineData("gouvernement", 1)] + [InlineData("continuellement", 1)] + [InlineData("inexistent", 0)] + public async Task FindTheWord(string word, int resultCount) + { + Index idx = await GetPlainIndex(); + IList results = await idx.Search(word).ToList(); + Assert.Equal(resultCount, results.Count); + } + + private async Task GetPlainIndex() + { + return await Lunr.Globalization.fr.Index.Build(async builder => + { + builder.ReferenceField = "id"; + + builder + .AddField("title") + .AddField("body", boost: 10); + + foreach (Document doc in _documents) + { + await builder.Add(doc); + } + }); + } + } +} diff --git a/LunrCoreTests/Globalization/fr/FrenchStopWordFilterTests.cs b/LunrCoreTests/Globalization/fr/FrenchStopWordFilterTests.cs new file mode 100644 index 0000000..a70a454 --- /dev/null +++ b/LunrCoreTests/Globalization/fr/FrenchStopWordFilterTests.cs @@ -0,0 +1,36 @@ +using System.Threading.Tasks; +using Lunr; +using Lunr.Globalization.fr; +using Xunit; + +namespace LunrCoreTests.Globalization.fr +{ + public class FrenchStopWordFilterTests + { + private readonly StopWordFilterBase filter = new FrenchStopWordFilter(); + + [Fact] + public async Task StopWordFilterFiltersStopWords() + { + string[] stopWords = new[] { "aurai", "elle", "leurs", "soyons", "êtes" }; + + foreach (string word in stopWords) + { + Assert.True(filter.IsStopWord(word)); + Assert.Empty(await filter.FilterFunction.BasicallyRun(word)); + } + } + + [Fact] + public async Task StopWordFilterIgnoresNonStopWords() + { + string[] nonStopWords = new[] { "baleine", "bisou", "brindille", "câlin" }; + + foreach (string word in nonStopWords) + { + Assert.False(filter.IsStopWord(word)); + Assert.Equal(new[] { word }, await filter.FilterFunction.BasicallyRun(word)); + } + } + } +} diff --git a/LunrCoreTests/Globalization/fr/FrenchTrimmerTests.cs 
b/LunrCoreTests/Globalization/fr/FrenchTrimmerTests.cs new file mode 100644 index 0000000..af91a9f --- /dev/null +++ b/LunrCoreTests/Globalization/fr/FrenchTrimmerTests.cs @@ -0,0 +1,21 @@ +using Lunr.Globalization.fr; +using Xunit; + +namespace LunrCoreTests.Globalization.fr +{ + public class FrenchTrimmerTests + { + [Theory] + [InlineData("d’Amritsar", "d’Amritsar")] // word + [InlineData("français.", "français")] // full stop + [InlineData("l'accès", "l'accès")] // inner apostrophe + [InlineData("Chloë'", "Chloë")] // trailing apostrophe + [InlineData("C’est!'", "C’est")] // exclamation mark + [InlineData("L’âge,'", "L’âge")] // comma + [InlineData("[nationalité]'", "nationalité")] // brackets + public void CheckTrim(string str, string expected) + { + Assert.Equal(expected, new FrenchTrimmer().Trim(str)); + } + } +} \ No newline at end of file diff --git a/LunrCoreTests/Globalization/it/ItalianSearchTests.cs b/LunrCoreTests/Globalization/it/ItalianSearchTests.cs new file mode 100644 index 0000000..ad5b744 --- /dev/null +++ b/LunrCoreTests/Globalization/it/ItalianSearchTests.cs @@ -0,0 +1,55 @@ +using System.Collections.Generic; +using System.Threading.Tasks; +using Lunr; +using Xunit; + +namespace LunrCoreTests.Globalization.it +{ + public class ItalianSearchTests + { + private readonly Document[] _documents = + { + new Document + { + { "id", "1" }, + { "title", "Italia" }, + { "body", "L'Italia (/iˈtalja/[9], ascolta[?·info]), ufficialmente Repubblica Italiana,[10] è una repubblica parlamentare situata nell'Europa meridionale, con una popolazione di 60,6 milioni di abitanti e Roma come capitale. Delimitata dall'arco alpino, confina a nord, da ovest a est, con Francia, Svizzera, Austria e Slovenia; il resto del territorio, circondato dai mari Ligure, Tirreno, Ionio e Adriatico, si protende nel mar Mediterraneo, occupando la penisola italiana e numerose isole (le maggiori sono Sicilia e Sardegna), per un totale di 301 340 km²[11]. 
Gli Stati della Città del Vaticano e di San Marino sono enclavi della Repubblica." } + }, + new Document + { + { "id", "2" }, + { "title", "Suddivisioni amministrative" }, + { "body", "Gli enti territoriali che, in base all'articolo 114 della Costituzione costituiscono, assieme allo Stato, la Repubblica italiana sono: le regioni (15 a statuto ordinario e 5 a statuto speciale); le città metropolitane (14); le province e i comuni (rispettivamente 93 e 7 999, dati ISTAT dell'anno 2016).[121] Nell'elenco che segue, per ciascuna regione è riportato lo stemma ufficiale e il nome del capoluogo. pronunziato" } + }, + }; + + [Theory] + [InlineData("Italia*", 2)] + [InlineData("assieme", 1)] + [InlineData("pronunziarle", 1)] + [InlineData("inexistent", 0)] + public async Task FindTheWord(string word, int resultCount) + { + Index idx = await GetPlainIndex(); + IList results = await idx.Search(word).ToList(); + Assert.Equal(resultCount, results.Count); + } + + private async Task GetPlainIndex() + { + return await Lunr.Globalization.it.Index.Build(async builder => + { + builder.ReferenceField = "id"; + + builder + .AddField("title") + .AddField("body", boost: 10); + + foreach (Document doc in _documents) + { + await builder.Add(doc); + } + }); + } + } +} \ No newline at end of file diff --git a/README.md b/README.md index 53cce88..1c0adfb 100644 --- a/README.md +++ b/README.md @@ -91,4 +91,5 @@ See the [`CONTRIBUTING.md` file](CONTRIBUTING.md). * Original code by [Oliver Nightingale](https://github.com/olivernn) and contributors, ported to .NET Core by [Bertrand Le Roy](https://github.com/bleroy). * Icon adapted from https://commons.wikimedia.org/wiki/File:Internal_Structure_of_the_Moon.JPG by Iqbal Mahmud under Creative Commons Attribution Share Alike 4.0 International. -* Perf tests use a [word list by Sindre Sorhus](https://github.com/sindresorhus/word-list). 
\ No newline at end of file +* Perf tests use a [word list by Sindre Sorhus](https://github.com/sindresorhus/word-list). +* Globalization ported from [Mihai Valentin's Lunr Languages Project](https://github.com/MihaiValentin/lunr-languages). \ No newline at end of file