-
-
Notifications
You must be signed in to change notification settings - Fork 24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Proposal: Adding additional languages support #27
base: main
Are you sure you want to change the base?
Changes from 17 commits
b13a222
a3e65eb
2632d56
22d8cd6
9938467
f49dba2
008509c
f688881
6f0fe59
29faf11
e5eff94
e382b1e
2b52c5e
2c05a22
1e18c81
f0428d0
1d8e0ff
d22a2b6
7bb449b
6117609
31210b7
069ed0d
dd215b6
b0f360c
e2dcd7c
1ab7162
d92b9a4
c69bba5
8162a92
07551ea
7361656
87909d2
896812d
f30515f
c086b31
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
// Ported from: https://github.com/MihaiValentin/lunr-languages/blob/master/LICENSE | ||
|
||
using System; | ||
using System.Linq; | ||
|
||
namespace Lunr.Globalization | ||
{ | ||
/// <summary>
/// One entry of a stemmer lookup table: the candidate text plus the values that
/// <c>SnowballProgram.find_among</c> / <c>find_among_b</c> read when the entry matches.
/// Equality and hashing are both based on the text content, so two entries built
/// from equal arguments compare equal and hash alike.
/// </summary>
public readonly struct Among : IEquatable<Among>
{
    /// <summary>Number of characters in <see cref="s"/>.</summary>
    public readonly int s_size;

    /// <summary>The candidate text as a character array (null only for <c>default(Among)</c>).</summary>
    public readonly char[] s;

    /// <summary>Value returned by the lookup when this entry is accepted.</summary>
    public readonly int result;

    /// <summary>Optional callback stored with the entry; null when none was supplied.</summary>
    public readonly Func<bool>? method;

    /// <summary>Index of the next entry the lookup tries after this one; a negative value ends the chain.</summary>
    public readonly int substring_i;

    /// <summary>Creates an entry for the text <paramref name="s"/>.</summary>
    /// <param name="s">Candidate text; must not be null.</param>
    /// <param name="substring_i">Stored in <see cref="substring_i"/>.</param>
    /// <param name="result">Stored in <see cref="result"/>.</param>
    /// <param name="method">Optional callback stored in <see cref="method"/>.</param>
    /// <exception cref="ArgumentException"><paramref name="s"/> is null.</exception>
    public Among(string s, int substring_i, int result, Func<bool> method = default!)
    {
        if (s == null)
        {
            throw new ArgumentException($"Bad Among initialization: s:{s}, substring_i: {substring_i}, result: {result}");
        }

        this.s_size = s.Length;
        this.s = s.ToCharArray();
        this.substring_i = substring_i;
        this.result = result;
        this.method = method;
    }

    /// <summary>Compares every field; the text is compared character by character.</summary>
    public bool Equals(Among other) =>
        s_size == other.s_size &&
        SameChars(s, other.s) &&
        result == other.result &&
        method == other.method &&
        substring_i == other.substring_i;

    /// <inheritdoc />
    public override bool Equals(object? obj) => obj is Among other && Equals(other);

    /// <summary>
    /// Hash code consistent with <see cref="Equals(Among)"/>: the text contributes
    /// its characters, not the identity of the array that holds them.
    /// </summary>
    public override int GetHashCode()
    {
        unchecked
        {
            var hashCode = s_size;
            hashCode = (hashCode * 397) ^ CharsHash(s);
            hashCode = (hashCode * 397) ^ result;
            hashCode = (hashCode * 397) ^ (method != null ? method.GetHashCode() : 0);
            hashCode = (hashCode * 397) ^ substring_i;
            return hashCode;
        }
    }

    public static bool operator ==(Among left, Among right) => left.Equals(right);

    public static bool operator !=(Among left, Among right) => !left.Equals(right);

    // Null-tolerant content comparison so default(Among) can be compared without throwing.
    private static bool SameChars(char[]? left, char[]? right)
    {
        if (ReferenceEquals(left, right))
        {
            return true;
        }

        if (left is null || right is null)
        {
            return false;
        }

        return left.SequenceEqual(right);
    }

    // Content-based hash of the text; 0 for the null array of default(Among).
    private static int CharsHash(char[]? chars)
    {
        if (chars is null)
        {
            return 0;
        }

        unchecked
        {
            var hash = 17;
            foreach (var c in chars)
            {
                hash = (hash * 31) + c;
            }

            return hash;
        }
    }
}
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,299 @@ | ||
// Ported from: https://github.com/MihaiValentin/lunr-languages/blob/master/LICENSE | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That it is a port from JS shows in that it looks like the operations on current could be way more efficiently done through a |
||
|
||
using System; | ||
|
||
namespace Lunr.Globalization | ||
{ | ||
internal sealed class SnowballProgram | ||
{ | ||
// The word being processed: assigned by SetCurrent, rewritten by replace_s,
// handed back and cleared by GetCurrent.
private string current; | ||
|
||
// Index into `current`; SetCurrent resets it to 0 and the matching methods move it.
internal int cursor; | ||
// Upper bound for forward scans (methods test `cursor < limit`); SetCurrent sets it to the word length.
internal int limit; | ||
// Lower bound for backward scans (methods test `cursor > limit_backward`); SetCurrent sets it to 0.
internal int limit_backward; | ||
// Start of the slice that slice_check validates and slice_from replaces.
internal int bra; | ||
// End of that slice; slice_check requires bra <= ket <= limit.
internal int ket; | ||
|
||
/// <summary>
/// Loads <paramref name="word"/> as the text to work on and resets every position:
/// cursor and both lower marks go to 0, both upper marks go to the word length.
/// </summary>
/// <param name="word">The word to process.</param>
public void SetCurrent(string word)
{
    this.current = word;
    this.cursor = 0;

    int length = word.Length;
    this.limit = length;
    this.limit_backward = 0;
    this.bra = 0;
    this.ket = length;
}
|
||
/// <summary>
/// Hands back the text currently held and drops the internal reference to it,
/// so a second call without an intervening <c>SetCurrent</c> returns null.
/// </summary>
/// <returns>The text as it stands after any modifications.</returns>
public string GetCurrent()
{
    string word = this.current;
    this.current = null!;
    return word;
}
|
||
/// <summary>
/// Tests whether the character at <c>cursor</c> is a member of the grouping
/// described by <paramref name="s"/>; on success the cursor advances by one.
/// </summary>
/// <param name="s">Bitmap with one bit per character code, starting at <paramref name="min"/>; a set bit marks a member.</param>
/// <param name="min">Smallest character code covered by the bitmap.</param>
/// <param name="max">Largest character code covered by the bitmap.</param>
/// <returns>True when the character is in the grouping; false otherwise or when the cursor is at <c>limit</c>.</returns>
/// <remarks>A reviewer of this change asked for idiomatic method names; the name is kept here so existing callers compile.</remarks>
public bool in_grouping(int[] s, int min, int max)
{
    if (cursor >= limit)
    {
        return false;
    }

    int ch = current[cursor];
    if (ch > max || ch < min)
    {
        return false;
    }

    ch -= min;
    // Fix: membership means the bit is SET. The previous test (== 0 => match)
    // accepted exactly the characters that are not in the grouping.
    if ((s[ch >> 3] & (0x1 << (ch & 0x7))) == 0)
    {
        return false;
    }

    cursor++;
    return true;
}
|
||
/// <summary>
/// Backward variant of the grouping test: checks whether the character just
/// before <c>cursor</c> is a member of the grouping described by
/// <paramref name="s"/>; on success the cursor moves back by one.
/// </summary>
/// <param name="s">Bitmap with one bit per character code, starting at <paramref name="min"/>; a set bit marks a member.</param>
/// <param name="min">Smallest character code covered by the bitmap.</param>
/// <param name="max">Largest character code covered by the bitmap.</param>
/// <returns>True when the character is in the grouping; false otherwise or when the cursor is at <c>limit_backward</c>.</returns>
public bool in_grouping_b(int[] s, int min, int max)
{
    if (cursor <= limit_backward)
    {
        return false;
    }

    int ch = current[cursor - 1];
    if (ch > max || ch < min)
    {
        return false;
    }

    ch -= min;
    // Fix: membership means the bit is SET. The previous test (== 0 => match)
    // accepted exactly the characters that are not in the grouping.
    if ((s[ch >> 3] & (0x1 << (ch & 0x7))) == 0)
    {
        return false;
    }

    cursor--;
    return true;
}
|
||
/// <summary>
/// Tests whether the character at <c>cursor</c> is NOT a member of the grouping
/// described by <paramref name="s"/>; on success the cursor advances by one.
/// </summary>
/// <param name="s">Bitmap with one bit per character code, starting at <paramref name="min"/>; a set bit marks a member.</param>
/// <param name="min">Smallest character code covered by the bitmap.</param>
/// <param name="max">Largest character code covered by the bitmap.</param>
/// <returns>True when the character is outside the grouping; false otherwise or when the cursor is at <c>limit</c>.</returns>
public bool out_grouping(int[] s, int min, int max)
{
    if (cursor >= limit)
    {
        return false;
    }

    int ch = current[cursor];

    // Characters outside [min, max] have no bit in the bitmap and are never members.
    bool outside = ch > max || ch < min;
    if (!outside)
    {
        ch -= min;
        // Fix: "outside" means the bit is CLEAR. The previous test (!= 0 => match)
        // accepted exactly the characters that are members of the grouping.
        outside = (s[ch >> 3] & (0x1 << (ch & 0x7))) == 0;
    }

    if (!outside)
    {
        return false;
    }

    cursor++;
    return true;
}
|
||
/// <summary>
/// Backward variant: tests whether the character just before <c>cursor</c> is NOT
/// a member of the grouping described by <paramref name="s"/>; on success the
/// cursor moves back by one.
/// </summary>
/// <param name="s">Bitmap with one bit per character code, starting at <paramref name="min"/>; a set bit marks a member.</param>
/// <param name="min">Smallest character code covered by the bitmap.</param>
/// <param name="max">Largest character code covered by the bitmap.</param>
/// <returns>True when the character is outside the grouping; false otherwise or when the cursor is at <c>limit_backward</c>.</returns>
public bool out_grouping_b(int[] s, int min, int max)
{
    if (cursor <= limit_backward)
    {
        return false;
    }

    int ch = current[cursor - 1];

    // Characters outside [min, max] have no bit in the bitmap and are never members.
    bool outside = ch > max || ch < min;
    if (!outside)
    {
        ch -= min;
        // Fix: "outside" means the bit is CLEAR. The previous test (!= 0 => match)
        // accepted exactly the characters that are members of the grouping.
        outside = (s[ch >> 3] & (0x1 << (ch & 0x7))) == 0;
    }

    if (!outside)
    {
        return false;
    }

    cursor--;
    return true;
}
|
||
/// <summary>
/// Checks whether the first <paramref name="s_size"/> characters of
/// <paramref name="s"/> appear in the text starting at <c>cursor</c>.
/// On a match the cursor moves past them; otherwise nothing changes.
/// </summary>
/// <param name="s_size">Number of characters of <paramref name="s"/> to compare.</param>
/// <param name="s">The text to look for.</param>
/// <returns>True on a match; false on a mismatch or when fewer than <paramref name="s_size"/> characters remain before <c>limit</c>.</returns>
public bool eq_s(int s_size, string s)
{
    if (limit - cursor >= s_size)
    {
        int offset = 0;
        while (offset < s_size)
        {
            if (current[cursor + offset] != s[offset])
            {
                return false;
            }

            offset++;
        }

        cursor += s_size;
        return true;
    }

    return false;
}
|
||
/// <summary>
/// Backward counterpart of <c>eq_s</c>: checks whether the first
/// <paramref name="s_size"/> characters of <paramref name="s"/> appear in the text
/// ending at <c>cursor</c>. On a match the cursor moves back to the start of
/// the matched run; otherwise nothing changes.
/// </summary>
/// <param name="s_size">Number of characters of <paramref name="s"/> to compare.</param>
/// <param name="s">The text to look for.</param>
/// <returns>True on a match; false on a mismatch or when fewer than <paramref name="s_size"/> characters lie between <c>limit_backward</c> and the cursor.</returns>
public bool eq_s_b(int s_size, string s)
{
    if (cursor - limit_backward >= s_size)
    {
        int offset = 0;
        while (offset < s_size)
        {
            if (current[cursor - s_size + offset] != s[offset])
            {
                return false;
            }

            offset++;
        }

        cursor -= s_size;
        return true;
    }

    return false;
}
|
||
/// <summary>
/// Looks up the text starting at <c>cursor</c> in the table <paramref name="v"/>.
/// A binary search narrows the table to one entry, then the chain of
/// <c>substring_i</c> links is followed until an entry is fully matched and
/// (when it carries a callback) accepted by that callback.
/// </summary>
/// <param name="v">Lookup table; the binary search relies on its ordering.</param>
/// <param name="v_size">Number of entries of <paramref name="v"/> to search.</param>
/// <returns>The <c>result</c> of the accepted entry, with the cursor placed after the matched text; 0 when the chain ends without an accepted entry.</returns>
public int find_among(Among[] v, int v_size)
{
    int start = cursor;
    int end = limit;
    int lo = 0;
    int hi = v_size;
    int matchedLo = 0;
    int matchedHi = 0;
    bool retriedFirst = false;

    // Phase 1: binary search. matchedLo / matchedHi remember how many leading
    // characters are already known to agree at each end of the interval, so
    // every probe can start comparing after the shared prefix.
    for (;;)
    {
        int mid = lo + ((hi - lo) >> 1);
        Among probe = v[mid];
        int matched = Math.Min(matchedLo, matchedHi);
        int delta = 0;

        while (matched < probe.s_size)
        {
            if (start + matched == end)
            {
                // Ran out of input: the input sorts before this entry.
                delta = -1;
                break;
            }

            delta = current[start + matched] - probe.s[matched];
            if (delta != 0)
            {
                break;
            }

            matched++;
        }

        if (delta < 0)
        {
            hi = mid;
            matchedHi = matched;
        }
        else
        {
            lo = mid;
            matchedLo = matched;
        }

        if (hi - lo > 1)
        {
            continue;
        }

        // With lo still at 0 the first entry may never have been probed;
        // allow exactly one more pass so it gets inspected.
        if (lo > 0 || hi == lo || retriedFirst)
        {
            break;
        }

        retriedFirst = true;
    }

    // Phase 2: walk the substring_i chain from the entry the search settled on.
    for (;;)
    {
        Among entry = v[lo];
        if (matchedLo >= entry.s_size)
        {
            cursor = start + entry.s_size;
            var condition = entry.method;
            if (condition == null)
            {
                return entry.result;
            }

            bool accepted = condition();
            // The callback may have moved the cursor; put it back after the match.
            cursor = start + entry.s_size;
            if (accepted)
            {
                return entry.result;
            }
        }

        lo = entry.substring_i;
        if (lo < 0)
        {
            return 0;
        }
    }
}
|
||
/// <summary>
/// Backward counterpart of <c>find_among</c>: looks up the text ending at
/// <c>cursor</c> in the table <paramref name="v"/>, comparing entries from their
/// last character towards their first.
/// </summary>
/// <param name="v">Lookup table; the binary search relies on its ordering.</param>
/// <param name="v_size">Number of entries of <paramref name="v"/> to search.</param>
/// <returns>The <c>result</c> of the accepted entry, with the cursor placed before the matched text; 0 when the chain ends without an accepted entry.</returns>
public int find_among_b(Among[] v, int v_size)
{
    int start = cursor;
    int floor = limit_backward;
    int lo = 0;
    int hi = v_size;
    int matchedLo = 0;
    int matchedHi = 0;
    bool retriedFirst = false;

    // Phase 1: binary search. matchedLo / matchedHi remember how many trailing
    // characters are already known to agree at each end of the interval.
    for (;;)
    {
        int mid = lo + ((hi - lo) >> 1);
        Among probe = v[mid];
        int matched = Math.Min(matchedLo, matchedHi);
        int delta = 0;

        while (matched < probe.s_size)
        {
            if (start - matched == floor)
            {
                // Ran out of input: the input sorts before this entry.
                delta = -1;
                break;
            }

            // Compare from the end of the entry backwards.
            delta = current[start - 1 - matched] - probe.s[probe.s_size - 1 - matched];
            if (delta != 0)
            {
                break;
            }

            matched++;
        }

        if (delta < 0)
        {
            hi = mid;
            matchedHi = matched;
        }
        else
        {
            lo = mid;
            matchedLo = matched;
        }

        if (hi - lo > 1)
        {
            continue;
        }

        // With lo still at 0 the first entry may never have been probed;
        // allow exactly one more pass so it gets inspected.
        if (lo > 0 || hi == lo || retriedFirst)
        {
            break;
        }

        retriedFirst = true;
    }

    // Phase 2: walk the substring_i chain from the entry the search settled on.
    for (;;)
    {
        Among entry = v[lo];
        if (matchedLo >= entry.s_size)
        {
            cursor = start - entry.s_size;
            var condition = entry.method;
            if (condition == null)
            {
                return entry.result;
            }

            bool accepted = condition();
            // The callback may have moved the cursor; put it back before the match.
            cursor = start - entry.s_size;
            if (accepted)
            {
                return entry.result;
            }
        }

        lo = entry.substring_i;
        if (lo < 0)
        {
            return 0;
        }
    }
}
|
||
/// <summary>
/// Replaces the characters in [<paramref name="c_bra"/>, <paramref name="c_ket"/>)
/// with <paramref name="s"/>, then shifts <c>limit</c> and, when affected, the cursor.
/// </summary>
/// <param name="c_bra">Start index of the replaced range.</param>
/// <param name="c_ket">End index (exclusive) of the replaced range.</param>
/// <param name="s">Replacement text.</param>
/// <returns>The change in text length (negative when the text shrinks).</returns>
public int replace_s(int c_bra, int c_ket, string s)
{
    int delta = s.Length - (c_ket - c_bra);
    string head = current.Substring(0, c_bra);
    string tail = current.Substring(c_ket);

    current = string.Concat(head, s, tail);
    limit += delta;

    if (cursor >= c_ket)
    {
        // Cursor sat at or after the replaced range: keep it on the same character.
        cursor += delta;
    }
    else if (cursor > c_bra)
    {
        // Cursor sat inside the replaced range: pull it back to the range start.
        cursor = c_bra;
    }

    return delta;
}
|
||
/// <summary>
/// Verifies that the slice marks are usable: 0 &lt;= bra &lt;= ket &lt;= limit, and
/// limit does not exceed the text length.
/// </summary>
/// <exception cref="InvalidOperationException">The marks are out of order or out of range.</exception>
public void slice_check()
{
    bool marksOrdered = bra >= 0 && bra <= ket && ket <= limit;
    // When no text is held the lifted comparison is false, so this part passes.
    bool limitWithinText = !(limit > current?.Length);

    if (marksOrdered && limitWithinText)
    {
        return;
    }

    throw new InvalidOperationException("faulty slice operation");
}
|
||
/// <summary>
/// Replaces the slice between <c>bra</c> and <c>ket</c> with <paramref name="s"/>
/// after validating the marks.
/// </summary>
/// <param name="s">Replacement text.</param>
/// <exception cref="InvalidOperationException">The slice marks fail <c>slice_check</c>.</exception>
public void slice_from(string s)
{
    this.slice_check();
    _ = this.replace_s(this.bra, this.ket, s);
}
|
||
/// <summary>
/// Deletes the slice between <c>bra</c> and <c>ket</c> by replacing it with
/// the empty string.
/// </summary>
/// <exception cref="InvalidOperationException">The slice marks fail <c>slice_check</c>.</exception>
public void slice_del() => this.slice_from(string.Empty);
|
||
/// <summary>
/// Replaces the range [<paramref name="c_bra"/>, <paramref name="c_ket"/>) with
/// <paramref name="s"/> and shifts the slice marks that sit at or after the
/// start of that range by the resulting change in length.
/// </summary>
/// <param name="c_bra">Start index of the replaced range.</param>
/// <param name="c_ket">End index (exclusive) of the replaced range.</param>
/// <param name="s">Text to put in place of the range.</param>
public void insert(int c_bra, int c_ket, string s)
{
    int delta = this.replace_s(c_bra, c_ket, s);

    if (c_bra <= this.bra)
    {
        this.bra += delta;
    }

    if (c_bra <= this.ket)
    {
        this.ket += delta;
    }
}
|
||
/// <summary>
/// Returns the text of the current slice, i.e. the characters from <c>bra</c>
/// up to (not including) <c>ket</c>.
/// </summary>
/// <returns>The slice text, or null when no text is currently held.</returns>
/// <exception cref="InvalidOperationException">The slice marks fail <c>slice_check</c>.</exception>
public string? slice_to()
{
    slice_check();

    // Fix: .NET's Substring takes (startIndex, length), not (start, end) like
    // JavaScript's substring. Passing ket as the length returned too many
    // characters or threw once bra + ket exceeded the text length.
    return current?.Substring(bra, ket - bra);
}
|
||
/// <summary>
/// Backward match of the whole of <paramref name="s"/> against the text ending
/// at <c>cursor</c>; shorthand for <c>eq_s_b</c> with the full length.
/// </summary>
/// <param name="s">The text to look for.</param>
/// <returns>True when the text matched and the cursor was moved back over it.</returns>
public bool eq_v_b(string s)
{
    int length = s.Length;
    return this.eq_s_b(length, s);
}
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
namespace Lunr.Globalization | ||
{ | ||
/// <summary>
/// Per-language descriptions of the characters that count as part of a word,
/// written as regular-expression character ranges.
/// </summary>
/// <remarks>
/// NOTE(review): <see cref="Thai"/> and <see cref="Vietnamese"/> include the
/// surrounding square brackets while the other entries are bare ranges.
/// Confirm that the consumer of these constants expects that difference.
/// </remarks>
internal static class WordCharacters
{
    // Fix: this constant previously held a space-separated list of Arabic words
    // (it reads like a stop-word list), unlike every sibling, which is a
    // character range. It now holds the Arabic letter range.
    // NOTE(review): range taken from the upstream lunr-languages Arabic module — confirm.
    public const string Arabic = "\u0621-\u065b\u0671\u0640";
    public const string Danish = Latin;
    public const string Dutch = Latin;
    public const string Spanish = Latin;
    public const string Finnish = Latin;
    public const string French = Latin;
    public const string German = Latin;
    public const string Hungarian = Latin;
    public const string Italian = Latin;
    public const string Japanese = "一二三四五六七八九十百千万億兆一-龠々〆ヵヶぁ-んァ-ヴーア-ン゙a-zA-Za-zA-Z0-90-9";
    public const string Norwegian = Latin;
    public const string Portuguese = Latin;
    public const string Romanian = Latin;
    public const string Russian = "\u0400-\u0484\u0487-\u052F\u1D2B\u1D78\u2DE0-\u2DFF\uA640-\uA69F\uFE2E\uFE2F";
    public const string Swedish = Latin;
    public const string Thai = "[\u0e00-\u0e7f]";
    public const string Turkish = Latin;
    public const string Vietnamese = "[A-Za-z\u0300\u0350\u0301\u0351\u0309\u0323\u0303\u0343\u00C2\u00E2\u00CA\u00EA\u00D4\u00F4\u0102-\u0103\u0110-\u0111\u01A0-\u01A1\u01AF-\u01B0]";

    // Shared by all Latin-script languages above. Written with fixed-width \u
    // escapes: C#'s \x escape is variable-length (1-4 hex digits) and would
    // silently swallow a following hex digit if the text were ever edited.
    private const string Latin = "A-Za-z\u00AA\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u02E0-\u02E4\u1D00-\u1D25\u1D2C-\u1D5C\u1D62-\u1D65\u1D6B-\u1D77\u1D79-\u1DBE\u1E00-\u1EFF\u2071\u207F\u2090-\u209C\u212A\u212B\u2132\u214E\u2160-\u2188\u2C60-\u2C7F\uA722-\uA787\uA78B-\uA7AD\uA7B0-\uA7B7\uA7F7-\uA7FF\uAB30-\uAB5A\uAB5C-\uAB64\uFB00-\uFB06\uFF21-\uFF3A\uFF41-\uFF5A";
}
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why use `Equals` here instead of `==`, since this is just an int?