Skip to content

Commit

Permalink
Use terms localizations
Browse files Browse the repository at this point in the history
  • Loading branch information
Enkidu93 committed Aug 15, 2024
1 parent e97df8c commit 056c9fd
Show file tree
Hide file tree
Showing 10 changed files with 96,806 additions and 52 deletions.
34,811 changes: 34,811 additions & 0 deletions src/SIL.Machine/Corpora/BiblicalTermsEn.xml

Large diffs are not rendered by default.

9,434 changes: 9,434 additions & 0 deletions src/SIL.Machine/Corpora/BiblicalTermsEs.xml

Large diffs are not rendered by default.

10,757 changes: 10,757 additions & 0 deletions src/SIL.Machine/Corpora/BiblicalTermsFr.xml

Large diffs are not rendered by default.

16,835 changes: 16,835 additions & 0 deletions src/SIL.Machine/Corpora/BiblicalTermsId.xml

Large diffs are not rendered by default.

24,842 changes: 24,842 additions & 0 deletions src/SIL.Machine/Corpora/BiblicalTermsPt.xml

Large diffs are not rendered by default.

144 changes: 101 additions & 43 deletions src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,64 @@ public class ParatextBackupTermsCorpus : DictionaryTextCorpus
"Pt6"
};

public ParatextBackupTermsCorpus(string fileName, IEnumerable<string> termCategories)
// Maps a language code to the manifest resource name of the Biblical Terms localization XML
// embedded in this assembly for that language. The constructor looks the requested language
// code up here and loads the resource by name; a code without an entry has no localization.
private static readonly Dictionary<string, string> SupportedLanguageTermsLocalizationXmls = new Dictionary<
string,
string
>()
{
{ "en", "SIL.Machine.Corpora.BiblicalTermsEn.xml" },
{ "es", "SIL.Machine.Corpora.BiblicalTermsEs.xml" },
{ "fr", "SIL.Machine.Corpora.BiblicalTermsFr.xml" },
{ "id", "SIL.Machine.Corpora.BiblicalTermsId.xml" },
{ "pt", "SIL.Machine.Corpora.BiblicalTermsPt.xml" }
};

public ParatextBackupTermsCorpus(
string fileName,
IEnumerable<string> termCategories,
string languageCode = null,
bool preferTermsLocalization = false
)
{
using (var archive = ZipFile.OpenRead(fileName))
{
ZipArchiveEntry termsFileEntry = archive.GetEntry("TermRenderings.xml");
if (termsFileEntry is null)
return;
XDocument doc;
bool useTermsRenderingXml = !preferTermsLocalization && termsFileEntry != null;

var settingsParser = new ZipParatextProjectSettingsParser(archive);
ParatextProjectSettings settings = settingsParser.Parse();
if (!SupportedLanguageTermsLocalizationXmls.TryGetValue(languageCode, out string resourceName))
{
if (termsFileEntry != null)
{
useTermsRenderingXml = true;
}
else
{
return;
}
}

XDocument termRenderingsDoc;
using (Stream keyTermsFile = termsFileEntry.Open())
if (useTermsRenderingXml)
{
termRenderingsDoc = XDocument.Load(keyTermsFile);
using (Stream keyTermsFile = termsFileEntry.Open())
{
doc = XDocument.Load(keyTermsFile);
}
}
else
{
using (
Stream keyTermsFile = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName)
)
{
doc = XDocument.Load(keyTermsFile);
}
}

var settingsParser = new ZipParatextProjectSettingsParser(archive);
ParatextProjectSettings settings = settingsParser.Parse();

//Align TermRenderings and BiblicalTerms
ZipArchiveEntry biblicalTermsFileEntry = archive.GetEntry(settings.BiblicalTermsFileName);

XDocument biblicalTermsDoc;
Expand Down Expand Up @@ -80,54 +121,71 @@ public ParatextBackupTermsCorpus(string fileName, IEnumerable<string> termCatego
{
termIdToCategoryDictionary = new Dictionary<string, string>();
}
AddTexts(doc, settings, termCategories, termIdToCategoryDictionary);
}
}

IEnumerable<XElement> termsElements = termRenderingsDoc
.Descendants()
.Where(n => n.Name.LocalName == "TermRendering");
/// <summary>
/// Builds a single text of key-term rows from a terms document and adds it to this corpus.
/// </summary>
/// <remarks>
/// When the document contains <c>TermRendering</c> elements, the glosses of each row are read from
/// the element's <c>Renderings</c> child. Otherwise the document's <c>Localization</c> elements are
/// used and the glosses are read from each element's <c>Gloss</c> attribute.
/// </remarks>
/// <param name="doc">The terms document to read.</param>
/// <param name="settings">Project settings whose Biblical Terms fields make up the text id.</param>
/// <param name="termCategories">
/// Categories to keep. When empty (or null), every term is kept; otherwise a term is kept only if
/// its id maps to one of these categories in <paramref name="termIdToCategoryDictionary"/>.
/// </param>
/// <param name="termIdToCategoryDictionary">Lookup from term id to category.</param>
public void AddTexts(
    XDocument doc,
    ParatextProjectSettings settings,
    IEnumerable<string> termCategories,
    IDictionary<string, string> termIdToCategoryDictionary
)
{
    // Materialize the query once so that checking for emptiness does not walk the document twice.
    List<XElement> termsElements = doc.Descendants().Where(n => n.Name.LocalName == "TermRendering").ToList();
    bool isTermRenderingsFile = termsElements.Count > 0;
    if (!isTermRenderingsFile)
    {
        termsElements = doc.Descendants().Where(n => n.Name.LocalName == "Localization").ToList();
    }

    // Snapshot the category filter: the caller's sequence is enumerated once instead of
    // several times per term, and membership checks become constant time.
    var categoryFilter = new HashSet<string>(termCategories ?? Enumerable.Empty<string>());

    string textId =
        $"{settings.BiblicalTermsListType}:{settings.BiblicalTermsProjectName}:{settings.BiblicalTermsFileName}";
    var rows = new List<TextRow>();
    foreach (XElement element in termsElements)
    {
        // Entries without an id cannot be keyed; skip them instead of throwing.
        string id = element.Attribute("Id")?.Value;
        if (id is null)
        {
            continue;
        }

        if (
            categoryFilter.Count > 0
            && (
                !termIdToCategoryDictionary.TryGetValue(id, out string category)
                || !categoryFilter.Contains(category)
            )
        )
        {
            continue;
        }

        string gloss = isTermRenderingsFile
            ? element.Element("Renderings")?.Value
            : element.Attribute("Gloss")?.Value;
        // An entry with no rendering/gloss node contributes nothing; skip it instead of throwing.
        if (gloss is null)
        {
            continue;
        }

        id = id.Replace("\n", "&#xA");
        rows.Add(new TextRow(textId, id) { Segment = GetGlosses(gloss) });
    }

    AddText(new MemoryText(textId, rows));
}

// Matches a gloss string that is exactly one bracketed span with no brackets inside it.
// Group 1 is the text between the brackets.
private static readonly Regex WholeGlossInBracketsRegex = new Regex(
    @"^\[([^\[\]]+)\]$",
    RegexOptions.Compiled
);

// Matches numbering such as " 1" or " 2.3" at the very end of a gloss string.
private static readonly Regex TrailingNumberingRegex = new Regex(@"\s+\d+(\.\d+)*$", RegexOptions.Compiled);

/// <summary>
/// Splits a raw rendering/gloss string into its distinct, trimmed glosses.
/// </summary>
/// <remarks>
/// Cleaning steps, in order: unwrap a string that is entirely enclosed in one pair of square
/// brackets; drop '?' and '*'; turn '/' into a space; remove parenthesized and bracketed spans via
/// StripParens; remove trailing numbering. The result is then split on "||", ',' and ';'.
/// </remarks>
/// <param name="gloss">The raw gloss string; null is treated as having no glosses.</param>
/// <returns>The non-empty glosses in order of first appearance, without duplicates.</returns>
public static IReadOnlyList<string> GetGlosses(string gloss)
{
    if (gloss is null)
        return Array.Empty<string>();

    //If entire term rendering is surrounded in square brackets, remove them
    // Groups[0] is the whole match (brackets included); the inner text is Groups[1].
    Match bracketMatch = WholeGlossInBracketsRegex.Match(gloss);
    if (bracketMatch.Success)
        gloss = bracketMatch.Groups[1].Value;

    gloss = gloss.Replace("?", "");
    gloss = gloss.Replace("*", "");
    gloss = gloss.Replace("/", " ");
    gloss = gloss.Trim();
    gloss = StripParens(gloss);
    gloss = StripParens(gloss, left: '[', right: ']');
    // Trim again so whitespace left behind by a stripped span cannot hide trailing numbering.
    gloss = gloss.Trim();
    // Strings are immutable: the replacement result must be assigned back to take effect.
    gloss = TrailingNumberingRegex.Replace(gloss, "");

    return gloss
        .Split(new string[] { "||" }, StringSplitOptions.None)
        .SelectMany(g => g.Split(new char[] { ',', ';' }))
        .Select(g => g.Trim())
        .Where(g => g != "")
        .Distinct()
        .ToList();
}
Expand Down
6 changes: 6 additions & 0 deletions src/SIL.Machine/SIL.Machine.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@
<EmbeddedResource Include="Corpora\BiblicalTerms.xml" />
<EmbeddedResource Include="Corpora\BiblicalTermsP6NT.xml" />
<EmbeddedResource Include="Corpora\BiblicalTermsSILNT.xml" />
<EmbeddedResource Include="Corpora\BiblicalTermsEn.xml" />
<EmbeddedResource Include="Corpora\BiblicalTermsEs.xml" />
<EmbeddedResource Include="Corpora\BiblicalTermsFr.xml" />
<EmbeddedResource Include="Corpora\BiblicalTermsId.xml" />
<EmbeddedResource Include="Corpora\BiblicalTermsPt.xml" />

</ItemGroup>

<ItemGroup>
Expand Down
21 changes: 16 additions & 5 deletions tests/SIL.Machine.Tests/Corpora/ParatextBackupTermsCorpusTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,21 @@ namespace SIL.Machine.Corpora;
public class ParatextKeyTermsCorpusTests
{
// Default TestEnvironment (preferTermsLocalization is false): the single expected row
// carries the rendering "Xerxes" rather than a localized gloss.
[Test]
public void TestGetKeyTerms()
public void TestGetKeyTermsFromTermsRenderings()
{
using var env = new TestEnvironment();
IList<TextRow> rows = env.Corpus.GetRows().ToList();
Assert.That(rows.Count, Is.EqualTo(1));
Assert.That(string.Join(" ", rows.First().Segment), Is.EqualTo("Abba"));
Assert.That(string.Join(" ", rows.First().Segment), Is.EqualTo("Xerxes"));
}

// With preferTermsLocalization set, the single expected row carries the glosses
// "Ahasuerus" and "Xerxes" instead of the project's own rendering.
[Test]
public void TestGetKeyTermsFromTermsLocalizations()
{
    using (var env = new TestEnvironment(preferTermsLocalization: true))
    {
        List<TextRow> rows = env.Corpus.GetRows().ToList();
        Assert.That(rows.Count, Is.EqualTo(1));

        string joinedSegment = string.Join(" ", rows[0].Segment);
        Assert.That(joinedSegment, Is.EqualTo("Ahasuerus Xerxes"));
    }
}

[Test]
Expand All @@ -35,19 +44,21 @@ public void TestStripParens(string testString, string expectedOutput, char left
[TestCase("Abba|| ", new string[] { "Abba" })]
[TestCase("Abba||Abbah?", new string[] { "Abba", "Abbah" })]
[TestCase("Abba (note)", new string[] { "Abba" })]
[TestCase("Abba (note)", new string[] { "Abba" })]
[TestCase("Ahasuerus, Xerxes; Assuerus", new string[] { "Ahasuerus", "Xerxes", "Assuerus" })]
public void TestGetGlosses(string glossString, IReadOnlyList<string> expectedOutput)
{
Assert.That(ParatextBackupTermsCorpus.GetRenderings(glossString), Is.EqualTo(expectedOutput));
Assert.That(ParatextBackupTermsCorpus.GetGlosses(glossString), Is.EqualTo(expectedOutput));
}

private class TestEnvironment : DisposableBase
{
private readonly string _backupPath;

public TestEnvironment()
public TestEnvironment(bool preferTermsLocalization = false)
{
_backupPath = CorporaTestHelpers.CreateTestParatextBackup();
Corpus = new ParatextBackupTermsCorpus(_backupPath, new string[] { "PN" });
Corpus = new ParatextBackupTermsCorpus(_backupPath, new string[] { "PN" }, "en", preferTermsLocalization);
}

public ParatextBackupTermsCorpus Corpus { get; }
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<BiblicalTermsList>
<Term Id="Abba">
<Term Id="אֲחַשְׁוֵרוֹשׁ">
<Category>PN</Category>
<Gloss>Abba</Gloss>
<Gloss>Ahasuerus</Gloss>
</Term>
</BiblicalTermsList>
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<TermRenderingsList>
<TermRendering Id="Abba" Guess="false">
<Renderings>Abba</Renderings>
<TermRendering Id="אֲחַשְׁוֵרוֹשׁ" Guess="false">
<Renderings>Xerxes</Renderings>
<Glossary />
<Changes />
<Notes />
Expand Down

0 comments on commit 056c9fd

Please sign in to comment.