Skip to content

Commit

Permalink
Accommodate new default behavior; handle duplicate term ids gracefully
Browse files Browse the repository at this point in the history
  • Loading branch information
Enkidu93 committed Aug 20, 2024
1 parent 575641f commit 84e3240
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 22 deletions.
4 changes: 2 additions & 2 deletions src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ public class ParatextBackupTermsCorpus : ParatextTermsCorpusBase
public ParatextBackupTermsCorpus(
ZipArchive archive,
IEnumerable<string> termCategories,
bool preferTermsLocalization = false
bool useTermGlosses = true
)
{
_archive = archive;
AddTexts(new ZipParatextProjectSettingsParser(archive).Parse(), termCategories, preferTermsLocalization);
AddTexts(new ZipParatextProjectSettingsParser(archive).Parse(), termCategories, useTermGlosses);
}

protected override bool Exists(string fileName)
Expand Down
23 changes: 13 additions & 10 deletions src/SIL.Machine/Corpora/ParatextTermsCorpusBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ protected void AddTexts(
if (
settings.LanguageCode != null
&& settings.BiblicalTermsListType == "Major"
&& !SupportedLanguageTermsLocalizationXmls.TryGetValue(settings.LanguageCode, out string resourceName)
&& SupportedLanguageTermsLocalizationXmls.TryGetValue(settings.LanguageCode, out string resourceName)
)
{
using (Stream keyTermsFile = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName))
Expand All @@ -104,8 +104,7 @@ protected void AddTexts(
}
}

IDictionary<string, IReadOnlyList<string>> termsRenderings =
new Dictionary<string, IReadOnlyList<string>>();
IDictionary<string, IEnumerable<string>> termsRenderings = new Dictionary<string, IEnumerable<string>>();
if (termRenderingsDoc != null)
{
termsRenderings = termRenderingsDoc
Expand All @@ -120,10 +119,12 @@ protected void AddTexts(
IReadOnlyList<string> glosses = GetGlosses(gloss);
return (id, glosses);
})
.GroupBy(kvp => kvp.Item1, kvp => kvp.Item2) //Handle duplicate term ids (which do exist) e.g. שִׁלֵּמִי
.Select(grouping => (grouping.Key, grouping.SelectMany(g => g)))
.ToDictionary(kvp => kvp.Item1, kvp => kvp.Item2);
}

IDictionary<string, IReadOnlyList<string>> termsGlosses = new Dictionary<string, IReadOnlyList<string>>();
IDictionary<string, IEnumerable<string>> termsGlosses = new Dictionary<string, IEnumerable<string>>();
if (termsGlossesDoc != null && useTermGlosses)
{
termsGlosses = termsGlossesDoc
Expand All @@ -134,10 +135,12 @@ protected void AddTexts(
.Select(kvp =>
{
string id = kvp.Item1.Replace("\n", "&#xA");
string gloss = kvp.Item2.Element("Gloss").Value;
string gloss = kvp.Item2.Attribute("Gloss").Value;
IReadOnlyList<string> glosses = GetGlosses(gloss);
return (id, glosses);
})
.GroupBy(kvp => kvp.Item1, kvp => kvp.Item2)
.Select(grouping => (grouping.Key, grouping.SelectMany(g => g)))
.ToDictionary(kvp => kvp.Item1, kvp => kvp.Item2);
}
if (termsGlosses.Count > 0 || termsRenderings.Count > 0)
Expand All @@ -156,21 +159,21 @@ IDictionary<string, string> termIdToCategoryDictionary
}

private void AddTerms(
IDictionary<string, IReadOnlyList<string>> termsRenderings,
IDictionary<string, IReadOnlyList<string>> termsGlosses,
IDictionary<string, IEnumerable<string>> termsRenderings,
IDictionary<string, IEnumerable<string>> termsGlosses,
ParatextProjectSettings settings
)
{
string textId =
$"{settings.BiblicalTermsListType}:{settings.BiblicalTermsProjectName}:{settings.BiblicalTermsFileName}";

//Prefer renderings to gloss localizations
IDictionary<string, IReadOnlyList<string>> glosses = termsRenderings
.Concat(termsGlosses.Where(kvp => !termsGlosses.ContainsKey(kvp.Key)))
IDictionary<string, IEnumerable<string>> glosses = termsRenderings
.Concat(termsGlosses.Where(kvp => !termsRenderings.ContainsKey(kvp.Key)))
.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
IText text = new MemoryText(
textId,
glosses.Select(kvp => new TextRow(textId, kvp.Key) { Segment = kvp.Value })
glosses.Select(kvp => new TextRow(textId, kvp.Key) { Segment = kvp.Value.ToList() })
);
AddText(text);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ public MemoryParatextProjectTermsCorpus(
ParatextProjectSettings settings,
IEnumerable<string> termCategories,
Dictionary<string, string> files,
bool preferTermsLocalization = false
bool useTermGlosses = true
)
{
Files = files;
AddTexts(settings, termCategories, preferTermsLocalization);
AddTexts(settings, termCategories, useTermGlosses);
}

protected override bool Exists(string fileName)
Expand Down
32 changes: 24 additions & 8 deletions tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsCorpusTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,28 @@ public void TestGetKeyTermsFromTermsLocalizations_NoTermRenderings()
new DefaultParatextProjectSettings(
biblicalTermsListType: "Major",
biblicalTermsFileName: "BiblicalTerms.xml"
)
),
useTermGlosses: true
);
IList<TextRow> rows = env.Corpus.GetRows().ToList();
Assert.That(rows.Count, Is.EqualTo(5726));
Assert.That(string.Join(" ", rows.First().Segment), Is.EqualTo("Abagtha"));
}

// Asserts that the corpus yields zero rows when it is built with
// useTermGlosses: false and no project files are supplied to the test environment.
[Test]
public void TestGetKeyTermsFromTermsLocalizations_NoTermRenderings_DoNotUseTermGlosses()
{
// Same "Major" / BiblicalTerms.xml settings as the sibling NoTermRenderings test,
// but with term glosses explicitly switched off.
var env = new TestEnvironment(
new DefaultParatextProjectSettings(
biblicalTermsListType: "Major",
biblicalTermsFileName: "BiblicalTerms.xml"
),
useTermGlosses: false
);
// Materialize the rows once so the count is taken from a concrete list.
IList<TextRow> rows = env.Corpus.GetRows().ToList();
// The test expects an empty corpus under these settings.
Assert.That(rows.Count, Is.EqualTo(0));
}

[Test]
public void TestGetKeyTermsFromTermsLocalizations_NoTermRenderings_PreferLocalization()
{
Expand All @@ -65,7 +80,7 @@ public void TestGetKeyTermsFromTermsLocalizations_NoTermRenderings_PreferLocaliz
biblicalTermsListType: "Major",
biblicalTermsFileName: "BiblicalTerms.xml"
),
preferTermsLocalization: true
useTermGlosses: true
);
IList<TextRow> rows = env.Corpus.GetRows().ToList();
Assert.That(rows.Count, Is.EqualTo(5726));
Expand All @@ -81,10 +96,10 @@ public void TestGetKeyTermsFromTermsLocalizations_()
biblicalTermsFileName: "BiblicalTerms.xml",
languageCode: "fr"
),
preferTermsLocalization: true
useTermGlosses: true
);
IList<TextRow> rows = env.Corpus.GetRows().ToList();
Assert.That(rows.Count, Is.EqualTo(5716));
Assert.That(rows.Count, Is.EqualTo(5715));
Assert.That(string.Join(" ", rows.First().Segment), Is.EqualTo("Aaron"));
}

Expand Down Expand Up @@ -112,11 +127,12 @@ public void TestGetKeyTermsFromTermsLocalizations_TermRenderingsExists_PreferLoc
</TermRenderingsList>"
}
},
preferTermsLocalization: true
useTermGlosses: true
);
IList<TextRow> rows = env.Corpus.GetRows().ToList();
Assert.That(rows.Count, Is.EqualTo(5726));
Assert.That(string.Join(" ", rows.First().Segment), Is.EqualTo("Abagtha"));
Assert.That(string.Join(" ", rows.First().Segment), Is.EqualTo("Xerxes"));
Assert.That(string.Join(" ", rows[2].Segment), Is.EqualTo("Abi"));
}

[Test]
Expand Down Expand Up @@ -149,15 +165,15 @@ public void TestGetGlosses(string glossString, IReadOnlyList<string> expectedOut
private class TestEnvironment(
ParatextProjectSettings? settings = null,
Dictionary<string, string>? files = null,
bool preferTermsLocalization = false
bool useTermGlosses = true
)
{
public MemoryParatextProjectTermsCorpus Corpus { get; } =
new MemoryParatextProjectTermsCorpus(
settings ?? new DefaultParatextProjectSettings(),
new string[] { "PN" },
files ?? new(),
preferTermsLocalization
useTermGlosses
);
}

Expand Down

0 comments on commit 84e3240

Please sign in to comment.