From 04d94fef59fd10ccb772902cc03b320bd9bfdf68 Mon Sep 17 00:00:00 2001 From: John Maxwell Date: Thu, 12 Sep 2024 13:46:32 -0700 Subject: [PATCH] Improve root guesser --- .../Morpher.cs | 150 +++++++++++++----- .../RootAllomorph.cs | 10 ++ src/SIL.Machine/Annotations/Annotation.cs | 22 ++- .../MorpherTests.cs | 78 ++++++++- 4 files changed, 218 insertions(+), 42 deletions(-) diff --git a/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs b/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs index f0af12e70..52ebeb35b 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs @@ -26,6 +26,7 @@ public class Morpher : IMorphologicalAnalyzer, IMorphologicalGenerator private readonly Dictionary _allomorphTries; private readonly ITraceManager _traceManager; private readonly ReadOnlyObservableCollection _morphemes; + private readonly IList _lexicalPatterns = new List(); public Morpher(ITraceManager traceManager, Language lang) { @@ -38,7 +39,12 @@ public Morpher(ITraceManager traceManager, Language lang) var allomorphs = new HashSet(stratum.Entries.SelectMany(entry => entry.Allomorphs)); var trie = new RootAllomorphTrie(ann => ann.Type() == HCFeatureSystem.Segment); foreach (RootAllomorph allomorph in allomorphs) - trie.Add(allomorph); + { + if (allomorph.IsPattern) + _lexicalPatterns.Add(allomorph); + else + trie.Add(allomorph); + } _allomorphTries[stratum] = trie; morphemes.AddRange(stratum.Entries); @@ -49,7 +55,6 @@ public Morpher(ITraceManager traceManager, Language lang) _synthesisRule = lang.CompileSynthesisRule(this); MaxStemCount = 2; MaxUnapplications = 0; - GuessRoot = false; LexEntrySelector = entry => true; RuleSelector = rule => true; @@ -72,11 +77,6 @@ public ITraceManager TraceManager /// public int MaxUnapplications { get; set; } - /// - /// When GuessRoot is true, guess LexEntries for the roots of the analyses. - /// - public bool GuessRoot { get; set; } - public Func LexEntrySelector { get; set; } public Func RuleSelector { get; set; } @@ -85,15 +85,29 @@ public Language Language get { return _lang; } } + public IList LexicalPatterns + { + get { return _lexicalPatterns; } + } + /// /// Parses the specified surface form. /// public IEnumerable ParseWord(string word) { - return ParseWord(word, out _); + return ParseWord(word, out _, false); } public IEnumerable ParseWord(string word, out object trace) + { + return ParseWord(word, out trace, false); + } + + /// + /// Parse the specified surface form, possibly tracing the parse. + /// If there are no analyses and guessRoot is true, then guess the root. + /// + public IEnumerable ParseWord(string word, out object trace, bool guessRoot) { // convert the word to its phonetic shape Shape shape = _lang.SurfaceStratum.CharacterDefinitionTable.Segment(word); @@ -118,9 +132,9 @@ public IEnumerable ParseWord(string word, out object trace) File.WriteAllLines("analyses.txt", lines.OrderBy(l => l)); #endif - var origAnalyses = GuessRoot ? analyses.ToList() : null; + var origAnalyses = guessRoot ? analyses.ToList() : null; var syntheses = Synthesize(word, analyses); - if (GuessRoot && syntheses.Count() == 0) + if (guessRoot && syntheses.Count() == 0) { // Guess roots when there are no results. List matches = new List(); @@ -350,43 +364,93 @@ private IEnumerable LexicalGuess(Word input) { if (_traceManager.IsTracing) _traceManager.LexicalLookup(input.Stratum, input); - var table = input.Stratum.CharacterDefinitionTable; - var allRange = Range.Create(input.Shape.First, input.Shape.Last); - var shapeStrings = EnumerateShapeStrings(input.Shape.GetNodes(allRange).ToList(), 0, "", table); - foreach (string shapeString in shapeStrings) + CharacterDefinitionTable table = input.Stratum.CharacterDefinitionTable; + IEnumerable shapeNodes = input.Shape.GetNodes(input.Range); + foreach (RootAllomorph lexicalPattern in _lexicalPatterns) { - var lexEntry = new LexEntry + IEnumerable shapePattern = lexicalPattern.Segments.Shape.GetNodes(lexicalPattern.Segments.Shape.Range); + foreach (List match in MatchNodesWithPattern(shapeNodes.ToList(), shapePattern.ToList())) { - Id = shapeString, - SyntacticFeatureStruct = input.SyntacticFeatureStruct, - Gloss = shapeString, - Stratum = input.Stratum, - IsPartial = input.SyntacticFeatureStruct.IsEmpty - }; - var root = new RootAllomorph(new Segments(table, shapeString)); - lexEntry.Allomorphs.Add(root); - Word newWord = input.Clone(); - newWord.RootAllomorph = root; - if (_traceManager.IsTracing) - _traceManager.SynthesizeWord(_lang, newWord); - newWord.Freeze(); - yield return newWord; + // Create a root allomorph for the guess. + string shapeString = match.ToString(table, false); + var root = new RootAllomorph(new Segments(table, shapeString)) + { + Guessed = true + }; + // Point the root allomorph to the lexical pattern in FieldWorks. + if (lexicalPattern.Properties.ContainsKey("ID")) + root.Properties["ID"] = lexicalPattern.Properties["ID"]; + if (lexicalPattern.Morpheme != null && lexicalPattern.Morpheme.Properties.ContainsKey("ID")) + root.Morpheme.Properties["ID"] = lexicalPattern.Morpheme.Properties["ID"]; + // Create a lexical entry to hold the root allomorph. + // (The root allmorph will point to the lexical entry.) + var lexEntry = new LexEntry + { + Id = shapeString, + SyntacticFeatureStruct = input.SyntacticFeatureStruct, + Gloss = shapeString, + Stratum = input.Stratum, + IsPartial = input.SyntacticFeatureStruct.IsEmpty + }; + lexEntry.Allomorphs.Add(root); + // Create a new word that uses the root allomorph. + Word newWord = input.Clone(); + newWord.RootAllomorph = root; + if (_traceManager.IsTracing) + _traceManager.SynthesizeWord(_lang, newWord); + newWord.Freeze(); + yield return newWord; + } } } - IEnumerable EnumerateShapeStrings(IList nodes, int index, string prefix, CharacterDefinitionTable table) + public IEnumerable> MatchNodesWithPattern(IList nodes, IList pattern, + int n = 0, int p = 0, bool obligatory = false, List prefix = null) { - if (index == nodes.Count) + var results = new List>(); + if (prefix == null) + prefix = new List(); + if (pattern.Count() == p) { - return new List { prefix }; + if (nodes.Count() == n) + // We match because we are at the end of both the pattern and the nodes. + results.Add(prefix); + return results; } - string[] strReps = table.GetMatchingStrReps(nodes[index]).ToArray(); - List strings = new List(); - foreach (string strRep in strReps) + if (pattern[p].Annotation.Optional && !obligatory) + // Try skipping this item in the pattern. + results.AddRange(MatchNodesWithPattern(nodes, pattern, n, p + 1, false, prefix)); + if (nodes.Count() == n) { - strings.AddRange(EnumerateShapeStrings(nodes, index + 1, prefix + strRep, table)); + // We fail to match because we are at the end of the nodes but not the pattern. + return results; } - return strings; + ShapeNode newNode = UnifyShapeNodes(nodes[n], pattern[p]); + if (newNode == null) + // We fail because the pattern didn't match the node here. + return results; + // Make a copy of prefix to avoid crosstalk and add newNode. + prefix = new List(prefix) + { + newNode + }; + if (pattern[p].Annotation.Iterative) + // Try using this item in the pattern again. + results.AddRange(MatchNodesWithPattern(nodes, pattern, n + 1, p, true, prefix)); + // Try the remainder of the nodes against the remainder of the pattern. + results.AddRange(MatchNodesWithPattern(nodes, pattern, n + 1, p + 1, false, prefix)); + return results; + } + + ShapeNode UnifyShapeNodes(ShapeNode node, ShapeNode pattern) + { + FeatureStruct fs = null; + node.Annotation.FeatureStruct.Unify(pattern.Annotation.FeatureStruct, out fs); + if (fs == null) + return null; + if (fs.ValueEquals(node.Annotation.FeatureStruct)) + return node; + return new ShapeNode(fs); } private bool IsWordValid(Word word) @@ -459,6 +523,18 @@ public IEnumerable AnalyzeWord(string word) } } + public IEnumerable AnalyzeWord(string word, bool guessRoot) + { + try + { + return ParseWord(word, out _, guessRoot).Select(CreateWordAnalysis); + } + catch (InvalidShapeException) + { + return Enumerable.Empty(); + } + } + private WordAnalysis CreateWordAnalysis(Word result) { int rootMorphemeIndex = -1; diff --git a/src/SIL.Machine.Morphology.HermitCrab/RootAllomorph.cs b/src/SIL.Machine.Morphology.HermitCrab/RootAllomorph.cs index 76d480bf3..1dacb8057 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/RootAllomorph.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/RootAllomorph.cs @@ -29,6 +29,16 @@ public Segments Segments public bool IsBound { get; set; } + /// + /// Does this represent a lexical pattern (e.g. [Seg]+)? + /// + public bool IsPattern { get; set; } + + /// + /// Was this allomorph guessed by a lexical pattern? + /// + public bool Guessed { get; set; } + protected override bool ConstraintsEqual(Allomorph other) { if (!(other is RootAllomorph otherAllo)) diff --git a/src/SIL.Machine/Annotations/Annotation.cs b/src/SIL.Machine/Annotations/Annotation.cs index 7292fc591..25c14caa2 100644 --- a/src/SIL.Machine/Annotations/Annotation.cs +++ b/src/SIL.Machine/Annotations/Annotation.cs @@ -19,6 +19,7 @@ public class Annotation private int _hashCode; private FeatureStruct _fs; private bool _optional; + private bool _iterative; private object _data; public Annotation(Range range, FeatureStruct fs) @@ -40,6 +41,7 @@ protected Annotation(Annotation ann) : this(ann.Range, ann.FeatureStruct.Clone()) { Optional = ann.Optional; + Iterative = ann.Iterative; _data = ann._data; if (ann._children != null && ann._children.Count > 0) Children.AddRange(ann.Children.Select(node => node.Clone())); @@ -128,6 +130,23 @@ public bool Optional } } + /// + /// Gets or sets a value indicating whether this annotation is iterative. + /// This is used in lexical patterns such as [Seg]+: + /// Kleene star = iterative and optional, Kleene plus = iterative and not optional. + /// + /// + /// true if this annotation is iterative, otherwise false. + /// + public bool Iterative + { + get { return _iterative; } + set + { + CheckFrozen(); + _iterative = value; + } + } internal int ListID { get; set; } public bool Remove(bool preserveChildren) @@ -188,6 +207,7 @@ public void Freeze() _hashCode = _hashCode * 31 + _fs.GetFrozenHashCode(); _hashCode = _hashCode * 31 + (_children == null ? 0 : _children.GetFrozenHashCode()); _hashCode = _hashCode * 31 + _optional.GetHashCode(); + _hashCode = _hashCode * 31 + _iterative.GetHashCode(); _hashCode = _hashCode * 31 + Range.GetHashCode(); } @@ -202,7 +222,7 @@ public bool ValueEquals(Annotation other) if (!IsLeaf && !_children.ValueEquals(other._children)) return false; - return _fs.ValueEquals(other._fs) && _optional == other._optional && Range == other.Range; + return _fs.ValueEquals(other._fs) && _optional == other._optional && _iterative == other._iterative && Range == other.Range; } public int GetFrozenHashCode() diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs index 8e7e75033..98e3931cc 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs @@ -85,14 +85,22 @@ public void AnalyzeWord_CanGuess_ReturnsCorrectAnalysis() ); Morphophonemic.MorphologicalRules.Add(edSuffix); + // Make a lexical pattern equivalent to Any+. + ShapeNode node = new ShapeNode(new FeatureStruct()); + node.Annotation.Optional = true; + node.Annotation.Iterative = true; + var shape = new Shape(begin => new ShapeNode(begin ? HCFeatureSystem.LeftSideAnchor : HCFeatureSystem.RightSideAnchor)); + shape.AddRange(new List { node }); + var lexicalPattern = new RootAllomorph(new Segments(Table1, "", shape)); + var morpher = new Morpher(TraceManager, Language); + morpher.LexicalPatterns.Add(lexicalPattern); + Assert.That(morpher.AnalyzeWord("gag"), Is.Empty); Assert.That(morpher.AnalyzeWord("gagd"), Is.Empty); - - morpher.GuessRoot = true; - var analyses = morpher.AnalyzeWord("gag").ToList(); + var analyses = morpher.AnalyzeWord("gag", true).ToList(); Assert.That(analyses[0].ToString(), Is.EquivalentTo("[*gag]")); - var analyses2 = morpher.AnalyzeWord("gagd").ToList(); + var analyses2 = morpher.AnalyzeWord("gagd", true).ToList(); Assert.That(analyses2[0].ToString(), Is.EquivalentTo("[*gag ed_suffix]")); } @@ -167,4 +175,66 @@ public void GenerateWords_CannotGenerate_ReturnsEmptyEnumerable() var analysis = new WordAnalysis(new IMorpheme[] { Entries["32"], edSuffix }, 0, "V"); Assert.That(morpher.GenerateWords(analysis), Is.Empty); } + + [Test] + public void TestMatchNodesWithPattern() + { + Morpher morpher = new Morpher(TraceManager, Language); + Feature feat1 = new StringFeature("1"); + Feature feat2 = new StringFeature("2"); + FeatureValue valueA = new StringFeatureValue("A"); + FeatureValue valueB = new StringFeatureValue("B"); + FeatureStruct fs1A = new FeatureStruct(); + FeatureStruct fs1B = new FeatureStruct(); + FeatureStruct fs2B = new FeatureStruct(); + fs1A.AddValue(feat1, valueA); + fs1B.AddValue(feat1, valueB); + fs2B.AddValue(feat2, valueB); + + // Test feature matching. + List nodesfs1A = new List { new ShapeNode(fs1A) }; + List nodesfs1B = new List { new ShapeNode(fs1B) }; + List nodesfs2B = new List { new ShapeNode(fs2B) }; + Assert.That(morpher.MatchNodesWithPattern(nodesfs1A, nodesfs1B), Is.Empty); + Assert.That(morpher.MatchNodesWithPattern(nodesfs1A, nodesfs1A), Is.EqualTo(new List> { nodesfs1A })); + var fs1A2B = morpher.MatchNodesWithPattern(nodesfs1A, nodesfs2B); + Assert.That(fs1A2B.ToList()[0][0].Annotation.FeatureStruct.GetValue(feat1).ToString(), Is.EqualTo(valueA.ToString())); + Assert.That(fs1A2B.ToList()[0][0].Annotation.FeatureStruct.GetValue(feat2).ToString(), Is.EqualTo(valueB.ToString())); + + List noNodes = new List { }; + List oneNode = new List { new ShapeNode(fs1A) }; + List twoNodes = new List { new ShapeNode(fs1A), new ShapeNode(fs1A) }; + List threeNodes = new List { new ShapeNode(fs1A), new ShapeNode(fs1A), new ShapeNode(fs1A) }; + List fourNodes = new List { new ShapeNode(fs1A), new ShapeNode(fs1A), new ShapeNode(fs1A), new ShapeNode(fs1A) }; + + // Test sequences. + Assert.That(morpher.MatchNodesWithPattern(twoNodes, twoNodes), Is.EquivalentTo(new List> { twoNodes })); + Assert.That(morpher.MatchNodesWithPattern(threeNodes, threeNodes), Is.EquivalentTo(new List> { threeNodes })); + + // Test optionality. + ShapeNode optionalNode = new ShapeNode(fs1A); + optionalNode.Annotation.Optional = true; + List optionalPattern = new List { optionalNode }; + Assert.That(morpher.MatchNodesWithPattern(noNodes, optionalPattern), Is.EquivalentTo(new List> { noNodes })); + Assert.That(morpher.MatchNodesWithPattern(oneNode, optionalPattern), Is.EquivalentTo(new List> { oneNode })); + Assert.That(morpher.MatchNodesWithPattern(twoNodes, optionalPattern), Is.Empty); + + // Test Kleene star. + ShapeNode starNode = new ShapeNode(fs1A); + starNode.Annotation.Optional = true; + starNode.Annotation.Iterative = true; + List starPattern = new List { starNode }; + Assert.That(morpher.MatchNodesWithPattern(noNodes, starPattern), Is.EquivalentTo(new List> { noNodes })); + var result = morpher.MatchNodesWithPattern(oneNode, starPattern); + Assert.That(morpher.MatchNodesWithPattern(oneNode, starPattern), Is.EquivalentTo(new List> { oneNode })); + Assert.That(morpher.MatchNodesWithPattern(twoNodes, starPattern), Is.EquivalentTo(new List> { twoNodes })); + + // Test Kleene plus. + ShapeNode plusNode = new ShapeNode(fs1A); + plusNode.Annotation.Iterative = true; + List plusPattern = new List { plusNode }; + Assert.That(morpher.MatchNodesWithPattern(noNodes, plusPattern), Is.Empty); + Assert.That(morpher.MatchNodesWithPattern(oneNode, plusPattern), Is.EquivalentTo(new List> { oneNode })); + Assert.That(morpher.MatchNodesWithPattern(twoNodes, plusPattern), Is.EquivalentTo(new List> { twoNodes })); + } }