From fcb2c6507594a5a5b6bcf4770c4f055303dde12c Mon Sep 17 00:00:00 2001 From: John Maxwell Date: Fri, 27 Sep 2024 13:26:02 -0700 Subject: [PATCH] Address Damien's concerns --- .../CharacterDefinitionTable.cs | 111 +++++++------- .../Morpher.cs | 135 ++++++++++++++---- .../RootAllomorph.cs | 30 ++-- .../Segments.cs | 3 + .../XmlLanguageLoader.cs | 2 +- src/SIL.Machine/Annotations/Annotation.cs | 20 --- src/SIL.Machine/Annotations/ShapeNode.cs | 15 ++ .../HermitCrabTestBase.cs | 8 +- .../MorpherTests.cs | 6 +- 9 files changed, 206 insertions(+), 124 deletions(-) diff --git a/src/SIL.Machine.Morphology.HermitCrab/CharacterDefinitionTable.cs b/src/SIL.Machine.Morphology.HermitCrab/CharacterDefinitionTable.cs index d08d231d4..5905d3a86 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/CharacterDefinitionTable.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/CharacterDefinitionTable.cs @@ -105,7 +105,7 @@ public IEnumerable GetMatchingStrReps(ShapeNode node) } } - private bool GetShapeNodes(string str, out IEnumerable nodes, out int errorPos) + private bool GetShapeNodes(string str, bool allowPattern, out IEnumerable nodes, out int errorPos) { var nodesList = new List(); int i = 0; @@ -132,66 +132,70 @@ private bool GetShapeNodes(string str, out IEnumerable nodes, out int } if (match) continue; - - // Check for pattern language. - // NB: This only happens when the characters don't match. - if (normalized[i] == '[') + if (allowPattern) { - // Example: [Seg]. - // Look for a natural class. - int closePos = normalized.IndexOf("]", i); - if (closePos > 0) + // Check for pattern language. + // NB: This only happens when the characters don't match. + // I thought about implementing this using Pattern, + // but the Matcher doesn't preserve the unifications of the nodes. + if (normalized[i] == '[') { - string className = normalized.Substring(i + 1, closePos - i - 1); - if (_naturalClassLookup.ContainsKey(className)) + // Example: [Seg]. + // Look for a natural class. + int closePos = normalized.IndexOf("]", i); + if (closePos > 0) { - NaturalClass naturalClass = _naturalClassLookup[className]; - var node = new ShapeNode(naturalClass.FeatureStruct); - nodesList.Add(node); - i = closePos + 1; - continue; + string className = normalized.Substring(i + 1, closePos - i - 1); + if (_naturalClassLookup.ContainsKey(className)) + { + NaturalClass naturalClass = _naturalClassLookup[className]; + var node = new ShapeNode(naturalClass.FeatureStruct); + nodesList.Add(node); + i = closePos + 1; + continue; + } } } - } - else if (normalized[i] == '(') - { - if (i + 1 < normalized.Length && normalized[i + 1] == '[') + else if (normalized[i] == '(') { - // The natural class that follows is optional. - // Wait for the close parenthesis to process. - optional = true; - optionalPos = i; - optionalCount = nodesList.Count; - i++; - continue; + if (i + 1 < normalized.Length && normalized[i + 1] == '[') + { + // The natural class that follows is optional. + // Wait for the close parenthesis to process. + optional = true; + optionalPos = i; + optionalCount = nodesList.Count; + i++; + continue; + } } - } - else if (normalized[i] == ')') - { - if (optional && nodesList.Count == optionalCount + 1) + else if (normalized[i] == ')') { - // Example: ([Seg]). - // Ill-formed: ([C][V]). - // Make the last node optional. - nodesList[nodesList.Count - 1].Annotation.Optional = true; - optional = false; - i++; - continue; + if (optional && nodesList.Count == optionalCount + 1) + { + // Example: ([Seg]). + // Ill-formed: ([C][V]). + // Make the last node optional. + nodesList[nodesList.Count - 1].Annotation.Optional = true; + optional = false; + i++; + continue; + } } - } - else if (normalized[i] == '*') - { - if (i > 0 && normalized[i - 1] == ']') + else if (normalized[i] == '*') { - // Example: [Seg]*. - // Make the last node Kleene star. - nodesList[nodesList.Count - 1].Annotation.Optional = true; - nodesList[nodesList.Count - 1].Annotation.Iterative = true; - i++; - continue; + if (i > 0 && normalized[i - 1] == ']') + { + // Example: [Seg]*. + // Make the last node Kleene star. + nodesList[nodesList.Count - 1].Annotation.Optional = true; + nodesList[nodesList.Count - 1].Iterative = true; + i++; + continue; + } } + // Kleene plus doesn't work because '+' is a boundary marker. } - // Kleene plus doesn't work because '+' is a boundary marker. // Failure nodes = null; @@ -215,10 +219,15 @@ private bool GetShapeNodes(string str, out IEnumerable nodes, out int } public Shape Segment(string str) + { + return Segment(str, false); + } + + public Shape Segment(string str, bool allowPattern) { IEnumerable nodes; int errorPos; - if (GetShapeNodes(str, out nodes, out errorPos)) + if (GetShapeNodes(str, allowPattern, out nodes, out errorPos)) { var shape = new Shape(begin => new ShapeNode( begin ? HCFeatureSystem.LeftSideAnchor : HCFeatureSystem.RightSideAnchor @@ -241,7 +250,7 @@ public int TrySegment(string str, out Shape shape) { IEnumerable nodes; int errorPos; - if (GetShapeNodes(str, out nodes, out errorPos)) + if (GetShapeNodes(str, true, out nodes, out errorPos)) { shape = new Shape(begin => new ShapeNode( begin ? HCFeatureSystem.LeftSideAnchor : HCFeatureSystem.RightSideAnchor diff --git a/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs b/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs index 601ad0f5a..bb66bf825 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs @@ -14,6 +14,7 @@ #if !SINGLE_THREADED using System.Collections.Concurrent; using System.Threading.Tasks; +using System.Text; #endif namespace SIL.Machine.Morphology.HermitCrab @@ -371,36 +372,58 @@ private IEnumerable LexicalGuess(Word input) ); foreach (List match in MatchNodesWithPattern(shapeNodes.ToList(), shapePattern.ToList())) { - // Create a root allomorph for the guess. - string shapeString = match.ToString(table, false); - if (shapeSet.Contains(shapeString)) - // Avoid duplicates caused by multiple paths through pattern (e.g. ([Seg])([Seg])). - continue; - shapeSet.Add(shapeString); - var root = new RootAllomorph(new Segments(table, shapeString)) { Guessed = true }; - // Create a lexical entry to hold the root allomorph. - // (The root's Morpheme will point to the lexical entry.) - var lexEntry = new LexEntry + IEnumerable shapeStrings = new List() { match.ToString(table, false) }; + // We could set shapeStrings to GetShapeStrings(match, table), + // but that produces spurious ambiguities that don't seem to have any value. + foreach (string shapeString in shapeStrings) { - Id = shapeString, - SyntacticFeatureStruct = input.SyntacticFeatureStruct, - Gloss = shapeString, - Stratum = input.Stratum, - IsPartial = input.SyntacticFeatureStruct.IsEmpty - }; - lexEntry.Allomorphs.Add(root); - // Point the root allomorph to the lexical pattern in FieldWorks. - if (lexicalPattern.Properties.ContainsKey("ID")) - root.Properties["ID"] = lexicalPattern.Properties["ID"]; - if (lexicalPattern.Morpheme != null && lexicalPattern.Morpheme.Properties.ContainsKey("ID")) - root.Morpheme.Properties["ID"] = lexicalPattern.Morpheme.Properties["ID"]; - // Create a new word that uses the root allomorph. - Word newWord = input.Clone(); - newWord.RootAllomorph = root; - if (_traceManager.IsTracing) - _traceManager.SynthesizeWord(_lang, newWord); - newWord.Freeze(); - yield return newWord; + if (shapeSet.Contains(shapeString)) + // Avoid duplicates caused by multiple paths through pattern (e.g. ([Seg])([Seg])). + continue; + shapeSet.Add(shapeString); + // Create a root allomorph for the guess. + var root = new RootAllomorph(new Segments(table, shapeString)) { Guessed = true }; + root.AllomorphCoOccurrenceRules.AddRange(lexicalPattern.AllomorphCoOccurrenceRules); + root.Environments.AddRange(lexicalPattern.Environments); + root.Properties.AddRange(lexicalPattern.Properties); + root.StemName = lexicalPattern.StemName; + root.IsBound = lexicalPattern.IsBound; + // Create a lexical entry to hold the root allomorph. + // (The root's Morpheme will point to the lexical entry.) + var lexEntry = new LexEntry + { + Id = shapeString, + Gloss = shapeString, + IsPartial = input.SyntacticFeatureStruct.IsEmpty, + SyntacticFeatureStruct = input.SyntacticFeatureStruct, + Stratum = input.Stratum, + }; + lexEntry.Allomorphs.Add(root); + // Point the root allomorph to the lexical pattern in FieldWorks. + if (lexicalPattern.Morpheme != null) + { + // Copy Morpheme fields. + Morpheme morpheme = lexicalPattern.Morpheme; + lexEntry.MorphemeCoOccurrenceRules.AddRange(morpheme.MorphemeCoOccurrenceRules); + lexEntry.Properties.AddRange(morpheme.Properties); + lexEntry.Stratum = morpheme.Stratum; + LexEntry patternEntry = (LexEntry)morpheme; + if (patternEntry != null) + { + // Copy LexEntry fields. + lexEntry.MprFeatures = patternEntry.MprFeatures; + lexEntry.SyntacticFeatureStruct = patternEntry.SyntacticFeatureStruct; + lexEntry.IsPartial = patternEntry.IsPartial; + } + } + // Create a new word that uses the root allomorph. + Word newWord = input.Clone(); + newWord.RootAllomorph = root; + if (_traceManager.IsTracing) + _traceManager.SynthesizeWord(_lang, newWord); + newWord.Freeze(); + yield return newWord; + } } } } @@ -443,7 +466,7 @@ public IEnumerable> MatchNodesWithPattern( return results; // Make a copy of prefix to avoid crosstalk and add newNode. prefix = new List(prefix) { newNode }; - if (pattern[p].Annotation.Iterative) + if (pattern[p].Iterative) // Try using this item in the pattern again. results.AddRange(MatchNodesWithPattern(nodes, pattern, n + 1, p, true, prefix)); // Try the remainder of the nodes against the remainder of the pattern. @@ -462,6 +485,58 @@ ShapeNode UnifyShapeNodes(ShapeNode node, ShapeNode pattern) return new ShapeNode(fs); } + private IEnumerable GetShapeStrings(IList nodes, CharacterDefinitionTable table) + { + IList strings = new List(); + if (nodes.Count == 0) + { + // We are at the end of the nodes. + strings.Add(""); + return strings; + } + + // Pop the first node. + ShapeNode node = nodes[0]; + nodes.RemoveAt(0); + + // Get suffixes. + IEnumerable suffixes = GetShapeStrings(nodes, table); + if ((node.Annotation.Type() == HCFeatureSystem.Boundary) || node.IsDeleted()) + // Skip this node. + return suffixes; + IEnumerable strReps = table.GetMatchingStrReps(node); + if (strReps.Count() == 0) + // Skip this node; + return suffixes; + + // Get string reps with unique feature structures. + IList uniqueStrReps = new List(); + foreach (string strRep in strReps) + { + CharacterDefinition cd = table[strRep]; + bool found = false; + foreach (string uniqueStrRep in uniqueStrReps) + { + CharacterDefinition uniqueCd = table[uniqueStrRep]; + if (uniqueCd.FeatureStruct.ValueEquals(cd.FeatureStruct)) + { + found = true; + break; + } + } + if (!found) + uniqueStrReps.Add(strRep); + } + + // take the cross-product of uniqueStrReps and suffixes. + foreach (string uniqueStrRep in uniqueStrReps) + { + foreach (string suffix in suffixes) + strings.Add(uniqueStrRep + suffix); + } + return strings; + } + private bool IsWordValid(Word word) { if ( diff --git a/src/SIL.Machine.Morphology.HermitCrab/RootAllomorph.cs b/src/SIL.Machine.Morphology.HermitCrab/RootAllomorph.cs index b0bb6d71d..c6f68194f 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/RootAllomorph.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/RootAllomorph.cs @@ -1,4 +1,5 @@ -using System.Linq; +using SIL.Machine.Annotations; +using System.Linq; namespace SIL.Machine.Morphology.HermitCrab { @@ -9,12 +10,24 @@ public class RootAllomorph : Allomorph { private readonly Segments _segments; + /// /// Initializes a new instance of the class. /// public RootAllomorph(Segments segments) { _segments = segments; + foreach (ShapeNode node in _segments.Shape.GetNodes(_segments.Shape.Range)) + { + if ( + node.Iterative + || (node.Annotation.Optional && node.Annotation.Type() != HCFeatureSystem.Boundary) + ) + { + IsPattern = true; + } + } + } /// @@ -34,20 +47,7 @@ public Segments Segments /// public bool IsPattern { - get - { - foreach (var node in _segments.Shape.GetNodes(_segments.Shape.Range)) - { - if ( - node.Annotation.Iterative - || (node.Annotation.Optional && node.Annotation.Type() != HCFeatureSystem.Boundary) - ) - { - return true; - } - } - return false; - } + get; private set; } protected override bool ConstraintsEqual(Allomorph other) diff --git a/src/SIL.Machine.Morphology.HermitCrab/Segments.cs b/src/SIL.Machine.Morphology.HermitCrab/Segments.cs index fa6152d83..dca5ee474 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/Segments.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/Segments.cs @@ -11,6 +11,9 @@ public class Segments public Segments(CharacterDefinitionTable table, string representation) : this(table, representation, table.Segment(representation)) { } + public Segments(CharacterDefinitionTable table, string representation, bool allowPattern) + : this(table, representation, table.Segment(representation, allowPattern)) { } + public Segments(CharacterDefinitionTable table, string representation, Shape shape) { _representation = representation; diff --git a/src/SIL.Machine.Morphology.HermitCrab/XmlLanguageLoader.cs b/src/SIL.Machine.Morphology.HermitCrab/XmlLanguageLoader.cs index fc2b01ae3..91801135a 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/XmlLanguageLoader.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/XmlLanguageLoader.cs @@ -498,7 +498,7 @@ private bool TryLoadLexEntry(XElement entryElem, CharacterDefinitionTable table, private RootAllomorph LoadRootAllomorph(XElement alloElem, CharacterDefinitionTable table) { var shapeStr = (string)alloElem.Element("PhoneticShape"); - Segments segments = new Segments(table, shapeStr); + Segments segments = new Segments(table, shapeStr, true); if (segments.Shape.All(n => n.Type() == HCFeatureSystem.Boundary)) throw new InvalidShapeException(shapeStr, 0); var allomorph = new RootAllomorph(segments) { IsBound = (bool?)alloElem.Attribute("isBound") ?? false }; diff --git a/src/SIL.Machine/Annotations/Annotation.cs b/src/SIL.Machine/Annotations/Annotation.cs index e0dc10d46..617f94f96 100644 --- a/src/SIL.Machine/Annotations/Annotation.cs +++ b/src/SIL.Machine/Annotations/Annotation.cs @@ -19,7 +19,6 @@ public class Annotation private int _hashCode; private FeatureStruct _fs; private bool _optional; - private bool _iterative; private object _data; public Annotation(Range range, FeatureStruct fs) @@ -41,7 +40,6 @@ protected Annotation(Annotation ann) : this(ann.Range, ann.FeatureStruct.Clone()) { Optional = ann.Optional; - Iterative = ann.Iterative; _data = ann._data; if (ann._children != null && ann._children.Count > 0) Children.AddRange(ann.Children.Select(node => node.Clone())); @@ -130,22 +128,6 @@ public bool Optional } } - /// - /// Gets or sets a value indicating whether this annotation is iterative. - /// This is used in lexical patterns such as [Seg]*: - /// - /// - /// true if this annotation is iterative, otherwise false. - /// - public bool Iterative - { - get { return _iterative; } - set - { - CheckFrozen(); - _iterative = value; - } - } internal int ListID { get; set; } public bool Remove(bool preserveChildren) @@ -206,7 +188,6 @@ public void Freeze() _hashCode = _hashCode * 31 + _fs.GetFrozenHashCode(); _hashCode = _hashCode * 31 + (_children == null ? 0 : _children.GetFrozenHashCode()); _hashCode = _hashCode * 31 + _optional.GetHashCode(); - _hashCode = _hashCode * 31 + _iterative.GetHashCode(); _hashCode = _hashCode * 31 + Range.GetHashCode(); } @@ -223,7 +204,6 @@ public bool ValueEquals(Annotation other) return _fs.ValueEquals(other._fs) && _optional == other._optional - && _iterative == other._iterative && Range == other.Range; } diff --git a/src/SIL.Machine/Annotations/ShapeNode.cs b/src/SIL.Machine/Annotations/ShapeNode.cs index cf72e7c8f..ad4ff1111 100644 --- a/src/SIL.Machine/Annotations/ShapeNode.cs +++ b/src/SIL.Machine/Annotations/ShapeNode.cs @@ -45,6 +45,21 @@ public Annotation Annotation get { return _ann; } } + /// + /// Whether this is an iterative node in a lexical pattern. + /// + public bool Iterative + { + get { return Annotation.Data != null; } + set + { + if (value) + Annotation.Data = value; + else + Annotation.Data = null; + } + } + public int CompareTo(ShapeNode other) { if (other.List != List) diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/HermitCrabTestBase.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/HermitCrabTestBase.cs index 734bcf852..d6b56ed60 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/HermitCrabTestBase.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/HermitCrabTestBase.cs @@ -795,10 +795,6 @@ public void FixtureSetUp() entry = AddEntry("bound", FeatureStruct.New(syntacticFeatSys).Symbol("V").Value, Morphophonemic, "dag"); entry.PrimaryAllomorph.IsBound = true; - var naturalClass = new NaturalClass(new FeatureStruct()) { Name = "Any" }; - Morphophonemic.CharacterDefinitionTable.AddNaturalClass(naturalClass); - AddEntry("pattern", new FeatureStruct(), Morphophonemic, "[Any]*"); - Language = new Language { Name = "Test", @@ -819,7 +815,7 @@ public void TestCleanup() } } - private LexEntry AddEntry(string gloss, FeatureStruct syntacticFS, Stratum stratum, params string[] forms) + public LexEntry AddEntry(string gloss, FeatureStruct syntacticFS, Stratum stratum, params string[] forms) { var entry = new LexEntry { @@ -829,7 +825,7 @@ private LexEntry AddEntry(string gloss, FeatureStruct syntacticFS, Stratum strat IsPartial = syntacticFS.IsEmpty }; foreach (string form in forms) - entry.Allomorphs.Add(new RootAllomorph(new Segments(stratum.CharacterDefinitionTable, form))); + entry.Allomorphs.Add(new RootAllomorph(new Segments(stratum.CharacterDefinitionTable, form, true))); stratum.Entries.Add(entry); Entries[gloss] = entry; return entry; diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs index db4443e3a..a01ec12c0 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs @@ -85,6 +85,10 @@ public void AnalyzeWord_CanGuess_ReturnsCorrectAnalysis() ); Morphophonemic.MorphologicalRules.Add(edSuffix); + var naturalClass = new NaturalClass(new FeatureStruct()) { Name = "Any" }; + Morphophonemic.CharacterDefinitionTable.AddNaturalClass(naturalClass); + AddEntry("pattern", new FeatureStruct(), Morphophonemic, "[Any]*"); + var morpher = new Morpher(TraceManager, Language); Assert.That(morpher.AnalyzeWord("gag"), Is.Empty); @@ -273,7 +277,7 @@ public void TestMatchNodesWithPattern() IList GetNodes(string pattern) { // Use Table2 because it has boundaries defined. - Shape shape = new Segments(Table2, pattern).Shape; + Shape shape = new Segments(Table2, pattern, true).Shape; return shape.GetNodes(shape.Range).ToList(); } }