Skip to content

Commit

Permalink
Improve root guesser
Browse files Browse the repository at this point in the history
  • Loading branch information
jtmaxwell3 committed Sep 12, 2024
1 parent 3344aad commit 04d94fe
Show file tree
Hide file tree
Showing 4 changed files with 218 additions and 42 deletions.
150 changes: 113 additions & 37 deletions src/SIL.Machine.Morphology.HermitCrab/Morpher.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ public class Morpher : IMorphologicalAnalyzer, IMorphologicalGenerator
private readonly Dictionary<Stratum, RootAllomorphTrie> _allomorphTries;
private readonly ITraceManager _traceManager;
private readonly ReadOnlyObservableCollection<Morpheme> _morphemes;
private readonly IList<RootAllomorph> _lexicalPatterns = new List<RootAllomorph>();

public Morpher(ITraceManager traceManager, Language lang)
{
Expand All @@ -38,7 +39,12 @@ public Morpher(ITraceManager traceManager, Language lang)
var allomorphs = new HashSet<RootAllomorph>(stratum.Entries.SelectMany(entry => entry.Allomorphs));
var trie = new RootAllomorphTrie(ann => ann.Type() == HCFeatureSystem.Segment);
foreach (RootAllomorph allomorph in allomorphs)
trie.Add(allomorph);
{
if (allomorph.IsPattern)
_lexicalPatterns.Add(allomorph);
else
trie.Add(allomorph);
}
_allomorphTries[stratum] = trie;

morphemes.AddRange(stratum.Entries);
Expand All @@ -49,7 +55,6 @@ public Morpher(ITraceManager traceManager, Language lang)
_synthesisRule = lang.CompileSynthesisRule(this);
MaxStemCount = 2;
MaxUnapplications = 0;
GuessRoot = false;
LexEntrySelector = entry => true;
RuleSelector = rule => true;

Expand All @@ -72,11 +77,6 @@ public ITraceManager TraceManager
/// </summary>
public int MaxUnapplications { get; set; }

/// <summary>
/// When GuessRoot is true, guess LexEntries for the roots of the analyses.
/// </summary>
public bool GuessRoot { get; set; }

public Func<LexEntry, bool> LexEntrySelector { get; set; }
public Func<IHCRule, bool> RuleSelector { get; set; }

Expand All @@ -85,15 +85,29 @@ public Language Language
get { return _lang; }
}

/// <summary>
/// Gets the lexical patterns that <c>LexicalGuess</c> matches against when guessing roots.
/// The constructor fills this list with every root allomorph whose <c>IsPattern</c> flag is set.
/// The returned list is the morpher's own backing list, so callers may add further patterns to it.
/// </summary>
public IList<RootAllomorph> LexicalPatterns => _lexicalPatterns;

/// <summary>
/// Parses the specified surface form without tracing output and without root guessing.
/// </summary>
/// <param name="word">The surface form to parse.</param>
/// <returns>The words produced by the three-argument overload with <c>guessRoot</c> set to <c>false</c>.</returns>
public IEnumerable<Word> ParseWord(string word) => ParseWord(word, out _, false);

/// <summary>
/// Parses the specified surface form without root guessing, handing the trace back to the caller.
/// </summary>
/// <param name="word">The surface form to parse.</param>
/// <param name="trace">Receives the trace object assigned by the three-argument overload.</param>
/// <returns>The words produced by the three-argument overload with <c>guessRoot</c> set to <c>false</c>.</returns>
public IEnumerable<Word> ParseWord(string word, out object trace) => ParseWord(word, out trace, false);

/// <summary>
/// Parse the specified surface form, possibly tracing the parse.
/// If there are no analyses and guessRoot is true, then guess the root.
/// </summary>
public IEnumerable<Word> ParseWord(string word, out object trace, bool guessRoot)
{
// convert the word to its phonetic shape
Shape shape = _lang.SurfaceStratum.CharacterDefinitionTable.Segment(word);
Expand All @@ -118,9 +132,9 @@ public IEnumerable<Word> ParseWord(string word, out object trace)

File.WriteAllLines("analyses.txt", lines.OrderBy(l => l));
#endif
var origAnalyses = GuessRoot ? analyses.ToList() : null;
var origAnalyses = guessRoot ? analyses.ToList() : null;
var syntheses = Synthesize(word, analyses);
if (GuessRoot && syntheses.Count() == 0)
if (guessRoot && syntheses.Count() == 0)
{
// Guess roots when there are no results.
List<Word> matches = new List<Word>();
Expand Down Expand Up @@ -350,43 +364,93 @@ private IEnumerable<Word> LexicalGuess(Word input)
{
    // Yields one guessed word for every way the input's shape matches one of the
    // registered lexical patterns. Each guess gets a freshly built root allomorph
    // (flagged as Guessed) wrapped in its own lexical entry.
    if (_traceManager.IsTracing)
        _traceManager.LexicalLookup(input.Stratum, input);
    CharacterDefinitionTable table = input.Stratum.CharacterDefinitionTable;
    // The input nodes are the same for every pattern, so materialize them once.
    List<ShapeNode> shapeNodes = input.Shape.GetNodes(input.Range).ToList();
    foreach (RootAllomorph lexicalPattern in _lexicalPatterns)
    {
        Shape patternShape = lexicalPattern.Segments.Shape;
        List<ShapeNode> shapePattern = patternShape.GetNodes(patternShape.Range).ToList();
        foreach (List<ShapeNode> match in MatchNodesWithPattern(shapeNodes, shapePattern))
        {
            // Create a root allomorph for the guess.
            string shapeString = match.ToString(table, false);
            var root = new RootAllomorph(new Segments(table, shapeString))
            {
                Guessed = true
            };
            // Point the root allomorph to the lexical pattern in FieldWorks.
            if (lexicalPattern.Properties.TryGetValue("ID", out var allomorphId))
                root.Properties["ID"] = allomorphId;
            // Create a lexical entry to hold the root allomorph.
            // (The root allomorph will point to the lexical entry once it has been added.)
            var lexEntry = new LexEntry
            {
                Id = shapeString,
                SyntacticFeatureStruct = input.SyntacticFeatureStruct,
                Gloss = shapeString,
                Stratum = input.Stratum,
                IsPartial = input.SyntacticFeatureStruct.IsEmpty
            };
            lexEntry.Allomorphs.Add(root);
            // Point the lexical entry to the pattern's morpheme in FieldWorks.
            // This is written on lexEntry itself rather than through root.Morpheme:
            // the root is newly constructed, so its Morpheme is not expected to be
            // set before the root is attached to an entry, and dereferencing it
            // earlier would fail.
            if (lexicalPattern.Morpheme != null
                && lexicalPattern.Morpheme.Properties.TryGetValue("ID", out var morphemeId))
            {
                lexEntry.Properties["ID"] = morphemeId;
            }
            // Create a new word that uses the root allomorph.
            Word newWord = input.Clone();
            newWord.RootAllomorph = root;
            if (_traceManager.IsTracing)
                _traceManager.SynthesizeWord(_lang, newWord);
            newWord.Freeze();
            yield return newWord;
        }
    }
}

IEnumerable<string> EnumerateShapeStrings(IList<ShapeNode> nodes, int index, string prefix, CharacterDefinitionTable table)
/// <summary>
/// Matches a sequence of shape nodes against a lexical pattern and collects every way in
/// which the complete sequence can be matched by the complete pattern.
/// </summary>
/// <param name="nodes">The nodes to match.</param>
/// <param name="pattern">
/// The pattern nodes. An item whose annotation is <c>Optional</c> may be skipped, and an item
/// whose annotation is <c>Iterative</c> may match several consecutive nodes.
/// </param>
/// <param name="n">Index of the next node to match; used by the recursion.</param>
/// <param name="p">Index of the next pattern item to apply; used by the recursion.</param>
/// <param name="obligatory">
/// <c>true</c> when the current pattern item is being re-applied after it already matched a
/// node, in which case it may no longer be skipped even if it is optional.
/// </param>
/// <param name="prefix">The nodes matched so far, or <c>null</c> when nothing has been matched yet.</param>
/// <returns>
/// One list per successful match, each containing the nodes produced by unifying an input
/// node with its pattern item. The result is empty when there is no match.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="nodes"/> or <paramref name="pattern"/> is <c>null</c>.</exception>
public IEnumerable<List<ShapeNode>> MatchNodesWithPattern(IList<ShapeNode> nodes, IList<ShapeNode> pattern,
    int n = 0, int p = 0, bool obligatory = false, List<ShapeNode> prefix = null)
{
    if (nodes == null)
        throw new ArgumentNullException(nameof(nodes));
    if (pattern == null)
        throw new ArgumentNullException(nameof(pattern));

    var results = new List<List<ShapeNode>>();
    if (prefix == null)
        prefix = new List<ShapeNode>();
    bool atEndOfNodes = n == nodes.Count;

    if (p == pattern.Count)
    {
        // We match only if we are at the end of both the pattern and the nodes.
        if (atEndOfNodes)
            results.Add(prefix);
        return results;
    }

    ShapeNode patternItem = pattern[p];
    if (patternItem.Annotation.Optional && !obligatory)
    {
        // Try skipping this item in the pattern.
        results.AddRange(MatchNodesWithPattern(nodes, pattern, n, p + 1, false, prefix));
    }

    // We are at the end of the nodes but not of the pattern: nothing more can match here.
    if (atEndOfNodes)
        return results;

    ShapeNode newNode = UnifyShapeNodes(nodes[n], patternItem);
    // The pattern item does not match the node at this position.
    if (newNode == null)
        return results;

    // Extend a copy of the prefix so that sibling branches of the recursion do not share state.
    var extendedPrefix = new List<ShapeNode>(prefix) { newNode };
    if (patternItem.Annotation.Iterative)
    {
        // Try using this item in the pattern again; it may not be skipped on that attempt.
        results.AddRange(MatchNodesWithPattern(nodes, pattern, n + 1, p, true, extendedPrefix));
    }
    // Try the remainder of the nodes against the remainder of the pattern.
    results.AddRange(MatchNodesWithPattern(nodes, pattern, n + 1, p + 1, false, extendedPrefix));
    return results;
}

/// <summary>
/// Unifies the feature structure of an input node with that of a pattern node.
/// </summary>
/// <param name="node">The input node.</param>
/// <param name="pattern">The pattern node to unify with.</param>
/// <returns>
/// <c>null</c> when unification yields no feature structure; <paramref name="node"/> itself when
/// the unified structure has the same value as the node's own; otherwise a new node that carries
/// the unified feature structure.
/// </returns>
private ShapeNode UnifyShapeNodes(ShapeNode node, ShapeNode pattern)
{
    FeatureStruct nodeFeatures = node.Annotation.FeatureStruct;
    nodeFeatures.Unify(pattern.Annotation.FeatureStruct, out FeatureStruct unified);
    if (unified == null)
        return null;
    // Reuse the original node when the pattern contributed nothing new.
    return unified.ValueEquals(nodeFeatures) ? node : new ShapeNode(unified);
}

private bool IsWordValid(Word word)
Expand Down Expand Up @@ -459,6 +523,18 @@ public IEnumerable<WordAnalysis> AnalyzeWord(string word)
}
}

/// <summary>
/// Analyzes the specified word, optionally guessing the root.
/// </summary>
/// <param name="word">The surface form to analyze.</param>
/// <param name="guessRoot">Passed through to <c>ParseWord</c> to enable root guessing.</param>
/// <returns>
/// One analysis per parse result, or an empty sequence when parsing throws
/// <see cref="InvalidShapeException"/>.
/// </returns>
public IEnumerable<WordAnalysis> AnalyzeWord(string word, bool guessRoot)
{
    IEnumerable<WordAnalysis> analyses;
    try
    {
        IEnumerable<Word> parses = ParseWord(word, out _, guessRoot);
        analyses = parses.Select(CreateWordAnalysis);
    }
    catch (InvalidShapeException)
    {
        analyses = Enumerable.Empty<WordAnalysis>();
    }
    return analyses;
}

private WordAnalysis CreateWordAnalysis(Word result)
{
int rootMorphemeIndex = -1;
Expand Down
10 changes: 10 additions & 0 deletions src/SIL.Machine.Morphology.HermitCrab/RootAllomorph.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@ public Segments Segments

public bool IsBound { get; set; }

/// <summary>
/// Gets or sets a value indicating whether this allomorph represents a lexical pattern
/// (e.g. [Seg]+) rather than a concrete root shape.
/// The Morpher constructor collects allomorphs with this flag set into its list of lexical
/// patterns instead of adding them to the root allomorph trie.
/// </summary>
public bool IsPattern { get; set; }

/// <summary>
/// Gets or sets a value indicating whether this allomorph was guessed by matching a lexical pattern.
/// The Morpher sets this to <c>true</c> on the root allomorphs it creates while guessing roots.
/// </summary>
public bool Guessed { get; set; }

protected override bool ConstraintsEqual(Allomorph other)
{
if (!(other is RootAllomorph otherAllo))
Expand Down
22 changes: 21 additions & 1 deletion src/SIL.Machine/Annotations/Annotation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ public class Annotation<TOffset>
private int _hashCode;
private FeatureStruct _fs;
private bool _optional;
private bool _iterative;
private object _data;

public Annotation(Range<TOffset> range, FeatureStruct fs)
Expand All @@ -40,6 +41,7 @@ protected Annotation(Annotation<TOffset> ann)
: this(ann.Range, ann.FeatureStruct.Clone())
{
Optional = ann.Optional;
Iterative = ann.Iterative;
_data = ann._data;
if (ann._children != null && ann._children.Count > 0)
Children.AddRange(ann.Children.Select(node => node.Clone()));
Expand Down Expand Up @@ -128,6 +130,23 @@ public bool Optional
}
}

/// <summary>
/// Gets or sets a value indicating whether this annotation may match repeatedly.
/// Lexical patterns such as [Seg]+ combine this flag with <see cref="Optional"/>:
/// iterative and optional is a Kleene star, iterative and not optional is a Kleene plus.
/// </summary>
/// <value>
/// <c>true</c> if this annotation is iterative, otherwise <c>false</c>.
/// </value>
public bool Iterative
{
    get => _iterative;
    set
    {
        // CheckFrozen() runs before the assignment, as in the other mutable members.
        CheckFrozen();
        _iterative = value;
    }
}
internal int ListID { get; set; }

public bool Remove(bool preserveChildren)
Expand Down Expand Up @@ -188,6 +207,7 @@ public void Freeze()
_hashCode = _hashCode * 31 + _fs.GetFrozenHashCode();
_hashCode = _hashCode * 31 + (_children == null ? 0 : _children.GetFrozenHashCode());
_hashCode = _hashCode * 31 + _optional.GetHashCode();
_hashCode = _hashCode * 31 + _iterative.GetHashCode();
_hashCode = _hashCode * 31 + Range.GetHashCode();
}

Expand All @@ -202,7 +222,7 @@ public bool ValueEquals(Annotation<TOffset> other)
if (!IsLeaf && !_children.ValueEquals(other._children))
return false;

return _fs.ValueEquals(other._fs) && _optional == other._optional && Range == other.Range;
return _fs.ValueEquals(other._fs) && _optional == other._optional && _iterative == other._iterative && Range == other.Range;
}

public int GetFrozenHashCode()
Expand Down
78 changes: 74 additions & 4 deletions tests/SIL.Machine.Morphology.HermitCrab.Tests/MorpherTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,22 @@ public void AnalyzeWord_CanGuess_ReturnsCorrectAnalysis()
);
Morphophonemic.MorphologicalRules.Add(edSuffix);

// Make a lexical pattern equivalent to Any+.
ShapeNode node = new ShapeNode(new FeatureStruct());
node.Annotation.Optional = true;
node.Annotation.Iterative = true;
var shape = new Shape(begin => new ShapeNode(begin ? HCFeatureSystem.LeftSideAnchor : HCFeatureSystem.RightSideAnchor));
shape.AddRange(new List<ShapeNode> { node });
var lexicalPattern = new RootAllomorph(new Segments(Table1, "", shape));

var morpher = new Morpher(TraceManager, Language);
morpher.LexicalPatterns.Add(lexicalPattern);

Assert.That(morpher.AnalyzeWord("gag"), Is.Empty);
Assert.That(morpher.AnalyzeWord("gagd"), Is.Empty);

morpher.GuessRoot = true;
var analyses = morpher.AnalyzeWord("gag").ToList();
var analyses = morpher.AnalyzeWord("gag", true).ToList();
Assert.That(analyses[0].ToString(), Is.EquivalentTo("[*gag]"));
var analyses2 = morpher.AnalyzeWord("gagd").ToList();
var analyses2 = morpher.AnalyzeWord("gagd", true).ToList();
Assert.That(analyses2[0].ToString(), Is.EquivalentTo("[*gag ed_suffix]"));
}

Expand Down Expand Up @@ -167,4 +175,66 @@ public void GenerateWords_CannotGenerate_ReturnsEmptyEnumerable()
var analysis = new WordAnalysis(new IMorpheme[] { Entries["32"], edSuffix }, 0, "V");
Assert.That(morpher.GenerateWords(analysis), Is.Empty);
}

/// <summary>
/// Exercises Morpher.MatchNodesWithPattern: feature unification, plain sequences,
/// optional items, Kleene star (optional + iterative) and Kleene plus (iterative only).
/// </summary>
[Test]
public void TestMatchNodesWithPattern()
{
    Morpher morpher = new Morpher(TraceManager, Language);
    Feature feat1 = new StringFeature("1");
    Feature feat2 = new StringFeature("2");
    FeatureValue valueA = new StringFeatureValue("A");
    FeatureValue valueB = new StringFeatureValue("B");
    FeatureStruct fs1A = new FeatureStruct();
    FeatureStruct fs1B = new FeatureStruct();
    FeatureStruct fs2B = new FeatureStruct();
    fs1A.AddValue(feat1, valueA);
    fs1B.AddValue(feat1, valueB);
    fs2B.AddValue(feat2, valueB);

    // Test feature matching.
    List<ShapeNode> nodesfs1A = new List<ShapeNode> { new ShapeNode(fs1A) };
    List<ShapeNode> nodesfs1B = new List<ShapeNode> { new ShapeNode(fs1B) };
    List<ShapeNode> nodesfs2B = new List<ShapeNode> { new ShapeNode(fs2B) };
    // Conflicting values for the same feature do not match.
    Assert.That(morpher.MatchNodesWithPattern(nodesfs1A, nodesfs1B), Is.Empty);
    Assert.That(morpher.MatchNodesWithPattern(nodesfs1A, nodesfs1A), Is.EqualTo(new List<List<ShapeNode>> { nodesfs1A }));
    // Different features match, and the matched node carries both values.
    // Run the match once and inspect the single resulting node.
    List<List<ShapeNode>> fs1A2B = morpher.MatchNodesWithPattern(nodesfs1A, nodesfs2B).ToList();
    Assert.That(fs1A2B, Has.Count.EqualTo(1));
    FeatureStruct unifiedFs = fs1A2B[0][0].Annotation.FeatureStruct;
    Assert.That(unifiedFs.GetValue(feat1).ToString(), Is.EqualTo(valueA.ToString()));
    Assert.That(unifiedFs.GetValue(feat2).ToString(), Is.EqualTo(valueB.ToString()));

    List<ShapeNode> noNodes = new List<ShapeNode> { };
    List<ShapeNode> oneNode = new List<ShapeNode> { new ShapeNode(fs1A) };
    List<ShapeNode> twoNodes = new List<ShapeNode> { new ShapeNode(fs1A), new ShapeNode(fs1A) };
    List<ShapeNode> threeNodes = new List<ShapeNode> { new ShapeNode(fs1A), new ShapeNode(fs1A), new ShapeNode(fs1A) };
    List<ShapeNode> fourNodes = new List<ShapeNode> { new ShapeNode(fs1A), new ShapeNode(fs1A), new ShapeNode(fs1A), new ShapeNode(fs1A) };

    // Test sequences.
    Assert.That(morpher.MatchNodesWithPattern(twoNodes, twoNodes), Is.EquivalentTo(new List<List<ShapeNode>> { twoNodes }));
    Assert.That(morpher.MatchNodesWithPattern(threeNodes, threeNodes), Is.EquivalentTo(new List<List<ShapeNode>> { threeNodes }));

    // Test optionality.
    ShapeNode optionalNode = new ShapeNode(fs1A);
    optionalNode.Annotation.Optional = true;
    List<ShapeNode> optionalPattern = new List<ShapeNode> { optionalNode };
    Assert.That(morpher.MatchNodesWithPattern(noNodes, optionalPattern), Is.EquivalentTo(new List<List<ShapeNode>> { noNodes }));
    Assert.That(morpher.MatchNodesWithPattern(oneNode, optionalPattern), Is.EquivalentTo(new List<List<ShapeNode>> { oneNode }));
    Assert.That(morpher.MatchNodesWithPattern(twoNodes, optionalPattern), Is.Empty);

    // Test Kleene star.
    ShapeNode starNode = new ShapeNode(fs1A);
    starNode.Annotation.Optional = true;
    starNode.Annotation.Iterative = true;
    List<ShapeNode> starPattern = new List<ShapeNode> { starNode };
    Assert.That(morpher.MatchNodesWithPattern(noNodes, starPattern), Is.EquivalentTo(new List<List<ShapeNode>> { noNodes }));
    Assert.That(morpher.MatchNodesWithPattern(oneNode, starPattern), Is.EquivalentTo(new List<List<ShapeNode>> { oneNode }));
    Assert.That(morpher.MatchNodesWithPattern(twoNodes, starPattern), Is.EquivalentTo(new List<List<ShapeNode>> { twoNodes }));

    // Test Kleene plus.
    ShapeNode plusNode = new ShapeNode(fs1A);
    plusNode.Annotation.Iterative = true;
    List<ShapeNode> plusPattern = new List<ShapeNode> { plusNode };
    Assert.That(morpher.MatchNodesWithPattern(noNodes, plusPattern), Is.Empty);
    Assert.That(morpher.MatchNodesWithPattern(oneNode, plusPattern), Is.EquivalentTo(new List<List<ShapeNode>> { oneNode }));
    Assert.That(morpher.MatchNodesWithPattern(twoNodes, plusPattern), Is.EquivalentTo(new List<List<ShapeNode>> { twoNodes }));
}
}

0 comments on commit 04d94fe

Please sign in to comment.