From 3344aad72eda387c6265f8982e282052c25a835d Mon Sep 17 00:00:00 2001 From: John Maxwell Date: Mon, 9 Sep 2024 11:58:37 -0700 Subject: [PATCH] Add MaxUnapplications and GuessRoot --- .../AnalysisStratumRule.cs | 2 + .../Morpher.cs | 82 ++++++++++++++++++- .../MorpherTests.cs | 32 ++++++++ 3 files changed, 115 insertions(+), 1 deletion(-) diff --git a/src/SIL.Machine.Morphology.HermitCrab/AnalysisStratumRule.cs b/src/SIL.Machine.Morphology.HermitCrab/AnalysisStratumRule.cs index 88c06984e..97c31c2e4 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/AnalysisStratumRule.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/AnalysisStratumRule.cs @@ -90,6 +90,8 @@ public IEnumerable Apply(Word input) output.Add(mruleOutWord); if (_morpher.TraceManager.IsTracing) _morpher.TraceManager.EndUnapplyStratum(_stratum, mruleOutWord); + if (_morpher.MaxUnapplications > 0 && output.Count >= _morpher.MaxUnapplications) + break; } return output; } diff --git a/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs b/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs index 4e92c395c..f0af12e70 100644 --- a/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs +++ b/src/SIL.Machine.Morphology.HermitCrab/Morpher.cs @@ -48,6 +48,8 @@ public Morpher(ITraceManager traceManager, Language lang) _analysisRule = lang.CompileAnalysisRule(this); _synthesisRule = lang.CompileSynthesisRule(this); MaxStemCount = 2; + MaxUnapplications = 0; + GuessRoot = false; LexEntrySelector = entry => true; RuleSelector = rule => true; @@ -63,6 +65,18 @@ public ITraceManager TraceManager public int MaxStemCount { get; set; } + /// + /// MaxUnapplications limits the number of unapplications (0, the default, means no limit) + /// to make it possible to debug words that take 30 minutes to parse + /// because there are too many unapplications. + /// + public int MaxUnapplications { get; set; } + + /// + /// When GuessRoot is true, guess LexEntries for the roots of the analyses. 
+ /// + public bool GuessRoot { get; set; } + public Func LexEntrySelector { get; set; } public Func RuleSelector { get; set; } @@ -104,8 +118,31 @@ public IEnumerable ParseWord(string word, out object trace) File.WriteAllLines("analyses.txt", lines.OrderBy(l => l)); #endif + var origAnalyses = GuessRoot ? analyses.ToList() : null; + var syntheses = Synthesize(word, analyses); + if (GuessRoot && syntheses.Count() == 0) + { + // Guess roots when there are no results. + List matches = new List(); + foreach (Word analysisWord in origAnalyses) + { + var lexicalGuesses = LexicalGuess(analysisWord).Distinct(); + foreach (Word synthesisWord in lexicalGuesses) + { + foreach (Word validWord in _synthesisRule.Apply(synthesisWord).Where(IsWordValid)) + { + if (IsMatch(word, validWord)) + matches.Add(validWord); + } + } + } + + matches.Sort((x, y) => y.Morphs.Count().CompareTo(x.Morphs.Count())); + + return matches; + } + return syntheses; - return Synthesize(word, analyses); } /// @@ -309,6 +346,49 @@ LexEntry entry in SearchRootAllomorphs(input.Stratum, input.Shape) } } + private IEnumerable LexicalGuess(Word input) + { + if (_traceManager.IsTracing) + _traceManager.LexicalLookup(input.Stratum, input); + var table = input.Stratum.CharacterDefinitionTable; + var allRange = Range.Create(input.Shape.First, input.Shape.Last); + var shapeStrings = EnumerateShapeStrings(input.Shape.GetNodes(allRange).ToList(), 0, "", table); + foreach (string shapeString in shapeStrings) + { + var lexEntry = new LexEntry + { + Id = shapeString, + SyntacticFeatureStruct = input.SyntacticFeatureStruct, + Gloss = shapeString, + Stratum = input.Stratum, + IsPartial = input.SyntacticFeatureStruct.IsEmpty + }; + var root = new RootAllomorph(new Segments(table, shapeString)); + lexEntry.Allomorphs.Add(root); + Word newWord = input.Clone(); + newWord.RootAllomorph = root; + if (_traceManager.IsTracing) + _traceManager.SynthesizeWord(_lang, newWord); + newWord.Freeze(); + yield return newWord; + } + } + + 
IEnumerable EnumerateShapeStrings(IList nodes, int index, string prefix, CharacterDefinitionTable table) + { + if (index == nodes.Count) + { + return new List { prefix }; + } + string[] strReps = table.GetMatchingStrReps(nodes[index]).ToArray(); + List strings = new List(); + foreach (string strRep in strReps) + { + strings.AddRange(EnumerateShapeStrings(nodes, index + 1, prefix + strRep, table)); + } + return strings; + } + private bool IsWordValid(Word word) { if ( diff --git a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs index 477096776..8e7e75033 100644 --- a/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs +++ b/tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs @@ -64,6 +64,38 @@ public void AnalyzeWord_CannotAnalyze_ReturnsEmptyEnumerable() Assert.That(morpher.AnalyzeWord("sagt"), Is.Empty); } + [Test] + public void AnalyzeWord_CanGuess_ReturnsCorrectAnalysis() + { + var any = FeatureStruct.New().Symbol(HCFeatureSystem.Segment).Value; + + var edSuffix = new AffixProcessRule + { + Id = "PAST", + Name = "ed_suffix", + Gloss = "PAST", + RequiredSyntacticFeatureStruct = FeatureStruct.New(Language.SyntacticFeatureSystem).Symbol("V").Value + }; + edSuffix.Allomorphs.Add( + new AffixProcessAllomorph + { + Lhs = { Pattern.New("1").Annotation(any).OneOrMore.Value }, + Rhs = { new CopyFromInput("1"), new InsertSegments(Table3, "+d") } + } + ); + Morphophonemic.MorphologicalRules.Add(edSuffix); + + var morpher = new Morpher(TraceManager, Language); + Assert.That(morpher.AnalyzeWord("gag"), Is.Empty); + Assert.That(morpher.AnalyzeWord("gagd"), Is.Empty); + + morpher.GuessRoot = true; + var analyses = morpher.AnalyzeWord("gag").ToList(); + Assert.That(analyses[0].ToString(), Is.EquivalentTo("[*gag]")); + var analyses2 = morpher.AnalyzeWord("gagd").ToList(); + Assert.That(analyses2[0].ToString(), Is.EquivalentTo("[*gag ed_suffix]")); + } + [Test] public void 
GenerateWords_CanGenerate_ReturnsCorrectWord() {