From ff75f8d18a462f4ab794dcff17d760217b6d213a Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 11 Oct 2024 10:04:56 -0400 Subject: [PATCH] Add parameter for filtering key terms by book/chapters --- .../Corpora/ParatextBackupTermsCorpus.cs | 5 +- .../Corpora/ParatextProjectTermsParserBase.cs | 46 ++++++++++++++++++- .../ParatextProjectTermsParserTests.cs | 32 +++++++++++-- 3 files changed, 77 insertions(+), 6 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs b/src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs index 1e111b38..46fbdaf7 100644 --- a/src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs +++ b/src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs @@ -9,7 +9,8 @@ public class ParatextBackupTermsCorpus : DictionaryTextCorpus public ParatextBackupTermsCorpus( string fileName, IEnumerable termCategories, - bool useTermGlosses = true + bool useTermGlosses = true, + IDictionary> chapters = null ) { using (var archive = ZipFile.OpenRead(fileName)) @@ -18,7 +19,7 @@ public ParatextBackupTermsCorpus( IEnumerable<(string, IReadOnlyList)> glosses = new ZipParatextProjectTermsParser( archive, settings - ).Parse(termCategories, useTermGlosses); + ).Parse(termCategories, useTermGlosses, chapters); string textId = $"{settings.BiblicalTermsListType}:{settings.BiblicalTermsProjectName}:{settings.BiblicalTermsFileName}"; diff --git a/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs index ec68b400..b58ce3c6 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs @@ -7,6 +7,7 @@ using System.Text.RegularExpressions; using System.Xml.Linq; using SIL.Extensions; +using SIL.Scripture; namespace SIL.Machine.Corpora { @@ -49,11 +50,13 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti public IEnumerable<(string TermId, IReadOnlyList Glosses)> Parse( IEnumerable termCategories, - bool useTermGlosses = true + bool useTermGlosses = true, + IDictionary> chapters = null ) { XDocument biblicalTermsDoc; IDictionary termIdToCategoryDictionary; + IDictionary> termIdToReferences; if (_settings.BiblicalTermsListType == "Project") { if (Exists(_settings.BiblicalTermsFileName)) @@ -62,6 +65,7 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti { biblicalTermsDoc = XDocument.Load(keyTermsFile); termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc); + termIdToReferences = GetReferences(biblicalTermsDoc); } } else @@ -74,6 +78,7 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti { biblicalTermsDoc = XDocument.Load(keyTermsFile); termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc); + termIdToReferences = GetReferences(biblicalTermsDoc); } } } @@ -87,11 +92,13 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti { biblicalTermsDoc = XDocument.Load(keyTermsFile); termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc); + termIdToReferences = GetReferences(biblicalTermsDoc); } } else { termIdToCategoryDictionary = new Dictionary(); + termIdToReferences = new Dictionary>(); } XDocument termsGlossesDoc = null; @@ -124,6 +131,7 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti .Where(n => n.Name.LocalName == "TermRendering") .Select(ele => (ele.Attribute("Id").Value, ele)) .Where(kvp => IsInCategory(kvp.Item1, termCategories, termIdToCategoryDictionary)) + .Where(kvp => IsInChapters(kvp.Item1, chapters, termIdToReferences)) .Select(kvp => { string id = kvp.Item1.Replace("\n", " "); @@ -144,6 +152,7 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti .Where(n => n.Name.LocalName == "Localization") .Select(ele => (ele.Attribute("Id").Value, ele)) .Where(kvp => IsInCategory(kvp.Item1, termCategories, termIdToCategoryDictionary)) + .Where(kvp => IsInChapters(kvp.Item1, chapters, termIdToReferences)) .Select(kvp => { string id = kvp.Item1.Replace("\n", " "); @@ -175,6 +184,24 @@ IDictionary termIdToCategoryDictionary || (termIdToCategoryDictionary.TryGetValue(id, out category) && termCategories.Contains(category)); } + private static bool IsInChapters( + string id, + IDictionary> chapters, + IDictionary> termIdToReferences + ) + { + ImmutableHashSet verseRefs; + return termIdToReferences.Count() == 0 + || chapters == null + || ( + termIdToReferences.TryGetValue(id, out verseRefs) + && verseRefs.Any(vr => + chapters.TryGetValue(vr.Book, out HashSet bookChapters) + && (bookChapters.Count() == 0 || bookChapters.Contains(vr.ChapterNum)) + ) + ); + } + public static IReadOnlyList GetGlosses(string gloss) { //If entire term rendering is surrounded in square brackets, remove them @@ -243,6 +270,23 @@ private static IDictionary GetCategoryPerId(XDocument biblicalTe .ToDictionary(e => e.Attribute("Id").Value, e => e.Element("Category")?.Value ?? ""); } + private static IDictionary> GetReferences(XDocument biblicalTermsDocument) + { + return biblicalTermsDocument + .Descendants() + .Where(n => n.Name.LocalName == "Term") + .DistinctBy(e => e.Attribute("Id").Value) + .ToDictionary( + e => e.Attribute("Id").Value, + e => + e.Element("References") + ?.Descendants() + .Where(reference => int.TryParse(reference.Value.Substring(0, 9), out int _)) + .Select(reference => new VerseRef(int.Parse(reference.Value.Substring(0, 9)))) + .ToImmutableHashSet() + ); + } + protected abstract Stream Open(string fileName); protected abstract bool Exists(string fileName); diff --git a/tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs b/tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs index fe895874..54034b9e 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs @@ -73,7 +73,7 @@ public void TestGetKeyTermsFromTermsLocalizations_NoTermRenderings_DoNotUseTermG } [Test] - public void TestGetKeyTermsFromTermsLocalizations_() + public void TestGetKeyTermsFromTermsLocalizations() { var env = new TestEnvironment( new DefaultParatextProjectSettings( @@ -88,6 +88,29 @@ public void TestGetKeyTermsFromTermsLocalizations_() Assert.That(string.Join(" ", terms.First().Glosses), Is.EqualTo("Aaron")); } + [Test] + public void TestGetKeyTermsFromTermsLocalizations_FilterByChapters() + { + var env = new TestEnvironment( + new DefaultParatextProjectSettings( + biblicalTermsListType: "Major", + biblicalTermsFileName: "BiblicalTerms.xml", + languageCode: "fr" + ), + useTermGlosses: true, + chapters: new Dictionary>() + { + { + "HAB", + new() { 1 } + } + } + ); + IEnumerable<(string TermId, IReadOnlyList Glosses)> terms = env.GetGlosses(); + Assert.That(terms.Count, Is.EqualTo(3)); //Habakkuk, YHWH, Kashdi/Chaldean are the only PN terms in HAB 1 + Assert.That(string.Join(" ", terms.First().Glosses), Is.EqualTo("Habaquq")); + } + [Test] public void TestGetKeyTermsFromTermsLocalizations_TermRenderingsExists_PreferLocalization() { @@ -150,16 +173,19 @@ public void TestGetGlosses(string glossString, IReadOnlyList expectedOut private class TestEnvironment( ParatextProjectSettings? settings = null, Dictionary? files = null, - bool useTermGlosses = true + bool useTermGlosses = true, + IDictionary>? chapters = null ) { private readonly bool _useTermGlosses = useTermGlosses; + private readonly IDictionary>? _chapters = chapters; + public ParatextProjectTermsParserBase Parser { get; } = new MemoryParatextProjectTermsParser(settings ?? new DefaultParatextProjectSettings(), files ?? new()); public IEnumerable<(string TermId, IReadOnlyList Glosses)> GetGlosses() { - return Parser.Parse(new string[] { "PN" }, _useTermGlosses); + return Parser.Parse(new string[] { "PN" }, _useTermGlosses, _chapters); } }