From e0fba037288b65b2afec3ae46e151fa124935512 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 23 Oct 2024 10:42:11 -0400 Subject: [PATCH 01/26] broken --- .../Corpora/CorpusAlignmentException.cs | 5 + .../Corpora/INParallelTextCorpus.cs | 11 + .../Corpora/NParallelTextCorpus.cs | 658 ++++++++++++++++++ .../Corpora/NParallelTextCorpusBase.cs | 36 + src/SIL.Machine/Corpora/NParallelTextRow.cs | 54 ++ .../Corpora/ParallelCorpusEnumerator.cs | 126 ++++ 6 files changed, 890 insertions(+) create mode 100644 src/SIL.Machine/Corpora/INParallelTextCorpus.cs create mode 100644 src/SIL.Machine/Corpora/NParallelTextCorpus.cs create mode 100644 src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs create mode 100644 src/SIL.Machine/Corpora/NParallelTextRow.cs create mode 100644 src/SIL.Machine/Corpora/ParallelCorpusEnumerator.cs diff --git a/src/SIL.Machine/Corpora/CorpusAlignmentException.cs b/src/SIL.Machine/Corpora/CorpusAlignmentException.cs index c86dd8cfd..2b8129858 100644 --- a/src/SIL.Machine/Corpora/CorpusAlignmentException.cs +++ b/src/SIL.Machine/Corpora/CorpusAlignmentException.cs @@ -8,5 +8,10 @@ public CorpusAlignmentException(string sourceRef, string targetRef) : base( $"Invalid format in {sourceRef} and {targetRef}. Mismatched key formats \"{sourceRef}\" and \"{targetRef}\". There may be an extraneous tab, missing ref, or inconsistent use of user-defined refs." ) { } + + public CorpusAlignmentException(string[] refs) + : base( + $"Invalid format in {string.Join(", ", refs)}. Mismatched key formats. There may be an extraneous tab, missing ref, or inconsistent use of user-defined refs." + ) { } } } diff --git a/src/SIL.Machine/Corpora/INParallelTextCorpus.cs b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs new file mode 100644 index 000000000..5a1e86f76 --- /dev/null +++ b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs @@ -0,0 +1,11 @@ +using System.Collections.Generic; + +namespace SIL.Machine.Corpora +{ + public interface INParallelTextCorpus : ICorpus + { + int Count(bool includeEmpty = true, IEnumerable textIds = null); + + IEnumerable GetRows(IEnumerable textIds); + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs new file mode 100644 index 000000000..e38d12383 --- /dev/null +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -0,0 +1,658 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using SIL.Linq; +using SIL.ObjectModel; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public class NParallelTextCorpus : NParallelTextCorpusBase + { + public NParallelTextCorpus(IEnumerable corpora, IComparer rowRefComparer = null) + { + Corpora = corpora.ToImmutableArray(); + if (Corpora.Count < 1) + throw new ArgumentException("There must be at least one corpora.", nameof(corpora)); + RowRefComparer = rowRefComparer ?? new DefaultRowRefComparer(); + AllRowsList = new bool[Corpora.Count] + .Select(_ => false) + .ToImmutableArray(); + } + + public bool GetIsTokenized(int i) => + i < Corpora.Count ? Corpora[i].IsTokenized : throw new ArgumentOutOfRangeException(nameof(i)); + + public int N => Corpora.Count; + + public IReadOnlyList AllRowsList { get; set; } + public IReadOnlyList Corpora { get; } + public IComparer RowRefComparer { get; } + + private static HashSet GetTextIdsFromCorpora( + IEnumerable corpora, + IEnumerable allRowsEnumerate + ) + { + IReadOnlyList> textIdListOfLists = corpora + .Select(c => c.Texts.Select(t => t.Id)) + .ToImmutableArray(); + + HashSet textIds = textIdListOfLists + .Skip(1) + .Aggregate( + new HashSet(textIdListOfLists.First()), + (h, e) => + { + h.IntersectWith(e); + return h; + } + ); + allRowsEnumerate + .Select((allRows, i) => (allRows, i)) + .Where(t => t.allRows) + .ForEach(t => textIds.UnionWith(textIdListOfLists[t.i])); + return textIds; + } + + public override IEnumerable GetRows(IEnumerable textIds) + { + HashSet filterTextIds = GetTextIdsFromCorpora(Corpora, AllRowsList); + + if (textIds != null) + filterTextIds.IntersectWith(textIds); + + IList> enumeratedCorpora = new List>(); + try + { + for (int i = 0; i < Corpora.Count; i++) + { + if (i == 0) + { + enumeratedCorpora.Add(Corpora[0].GetRows(filterTextIds).GetEnumerator()); + } + else + { + enumeratedCorpora.Add( + new ParallelCorpusEnumerator( + Corpora[i].GetRows(filterTextIds).GetEnumerator(), + Corpora[0].Versification, + Corpora[i].Versification + ) + ); + } + } + return GetRows(enumeratedCorpora); + } + finally + { + foreach (IEnumerator enumerator in enumeratedCorpora) + { + enumerator.Dispose(); + } + } + } + + private IList MinRefIndexes(IList refs) + { + object minRef = refs[0]; + IList minRefIndexes = new List(0); + for (int i = 1; i < refs.Count; i++) + { + if (RowRefComparer.Compare(refs[i], minRef) < 0) + { + minRef = refs[i]; + minRefIndexes.Clear(); + minRefIndexes.Add(i); + } + else if (RowRefComparer.Compare(refs[i], minRef) == 0) + { + minRefIndexes.Add(i); + } + } + return minRefIndexes; + } + + private IEnumerable GetRows(IList> enumerators) + { + { + var rangeInfo = new NRangeInfo { Versification = Corpora[0].Versification }; + + List[] sameRefRows = new List[Corpora.Count]; + bool[] completed = enumerators.Select(e => !e.MoveNext()).ToArray(); + + while (!completed.Any()) + { + IList minRefIndexes; + IList currentRefs = enumerators.Select(e => e.Current.Ref).ToArray(); + try + { + minRefIndexes = MinRefIndexes(currentRefs); + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(currentRefs.Select(r => r.ToString()).ToArray()); + } + if (minRefIndexes.Count == N) + { + if ( + (!AllTargetRows && srcEnumerator.Current.IsInRange) + || (!AllSourceRows && trgEnumerator.Current.IsInRange) + ) + { + if ( + rangeInfo.IsInRange + && ( + ( + srcEnumerator.Current.IsInRange + && !trgEnumerator.Current.IsInRange + && srcEnumerator.Current.Segment.Count > 0 + ) + || ( + !srcEnumerator.Current.IsInRange + && trgEnumerator.Current.IsInRange + && trgEnumerator.Current.Segment.Count > 0 + ) + || ( + srcEnumerator.Current.IsInRange + && trgEnumerator.Current.IsInRange + && srcEnumerator.Current.Segment.Count > 0 + && trgEnumerator.Current.Segment.Count > 0 + ) + ) + ) + { + yield return rangeInfo.CreateRow(); + } + + rangeInfo.TextId = srcEnumerator.Current.TextId; + rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); + rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); + sourceSameRefRows.Clear(); + targetSameRefRows.Clear(); + if (rangeInfo.IsSourceEmpty) + rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; + if (rangeInfo.IsTargetEmpty) + rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; + rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); + rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); + } + else + { + if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) + { + foreach (TextRow prevSourceRow in sourceSameRefRows) + { + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + prevSourceRow, + trgEnumerator.Current + ) + ) + { + yield return row; + } + } + } + + if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) + { + foreach (TextRow prevTargetRow in targetSameRefRows) + { + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + srcEnumerator.Current, + prevTargetRow + ) + ) + { + yield return row; + } + } + } + + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + srcEnumerator.Current, + trgEnumerator.Current, + compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null + ) + ) + { + yield return row; + } + } + + sourceSameRefRows.Add(srcEnumerator.Current); + srcCompleted = !srcEnumerator.MoveNext(); + + targetSameRefRows.Add(trgEnumerator.Current); + trgCompleted = !trgEnumerator.MoveNext(); + } + if (compare < 0) + { + if (!AllTargetRows && srcEnumerator.Current.IsInRange) + { + if ( + rangeInfo.IsInRange + && trgEnumerator.Current.IsInRange + && trgEnumerator.Current.Segment.Count > 0 + ) + { + yield return rangeInfo.CreateRow(); + } + rangeInfo.TextId = srcEnumerator.Current.TextId; + rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); + targetSameRefRows.Clear(); + if (rangeInfo.IsSourceEmpty) + rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; + rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); + } + else + { + foreach ( + ParallelTextRow row in CreateSourceRows( + rangeInfo, + srcEnumerator.Current, + targetSameRefRows, + forceTargetInRange: srcEnumerator.Current.TextId == trgEnumerator.Current.TextId + && !trgEnumerator.Current.IsRangeStart + && trgEnumerator.Current.IsInRange + ) + ) + { + yield return row; + } + } + + sourceSameRefRows.Add(srcEnumerator.Current); + srcCompleted = !srcEnumerator.MoveNext(); + } + else if (compare > 0) + { + if (!AllSourceRows && trgEnumerator.Current.IsInRange) + { + if ( + rangeInfo.IsInRange + && srcEnumerator.Current.IsInRange + && srcEnumerator.Current.Segment.Count > 0 + ) + { + yield return rangeInfo.CreateRow(); + } + rangeInfo.TextId = trgEnumerator.Current.TextId; + rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); + sourceSameRefRows.Clear(); + if (rangeInfo.IsTargetEmpty) + rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; + rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); + } + else + { + foreach ( + ParallelTextRow row in CreateTargetRows( + rangeInfo, + trgEnumerator.Current, + sourceSameRefRows, + forceSourceInRange: trgEnumerator.Current.TextId == srcEnumerator.Current.TextId + && !srcEnumerator.Current.IsRangeStart + && srcEnumerator.Current.IsInRange + ) + ) + { + yield return row; + } + } + + targetSameRefRows.Add(trgEnumerator.Current); + trgCompleted = !trgEnumerator.MoveNext(); + } + else + // compare == 0 - the refs are the same + { + if ( + (!AllTargetRows && srcEnumerator.Current.IsInRange) + || (!AllSourceRows && trgEnumerator.Current.IsInRange) + ) + { + if ( + rangeInfo.IsInRange + && ( + ( + srcEnumerator.Current.IsInRange + && !trgEnumerator.Current.IsInRange + && srcEnumerator.Current.Segment.Count > 0 + ) + || ( + !srcEnumerator.Current.IsInRange + && trgEnumerator.Current.IsInRange + && trgEnumerator.Current.Segment.Count > 0 + ) + || ( + srcEnumerator.Current.IsInRange + && trgEnumerator.Current.IsInRange + && srcEnumerator.Current.Segment.Count > 0 + && trgEnumerator.Current.Segment.Count > 0 + ) + ) + ) + { + yield return rangeInfo.CreateRow(); + } + + rangeInfo.TextId = srcEnumerator.Current.TextId; + rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); + rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); + sourceSameRefRows.Clear(); + targetSameRefRows.Clear(); + if (rangeInfo.IsSourceEmpty) + rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; + if (rangeInfo.IsTargetEmpty) + rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; + rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); + rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); + } + else + { + if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) + { + foreach (TextRow prevSourceRow in sourceSameRefRows) + { + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + prevSourceRow, + trgEnumerator.Current + ) + ) + { + yield return row; + } + } + } + + if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) + { + foreach (TextRow prevTargetRow in targetSameRefRows) + { + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + srcEnumerator.Current, + prevTargetRow + ) + ) + { + yield return row; + } + } + } + + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + srcEnumerator.Current, + trgEnumerator.Current, + compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null + ) + ) + { + yield return row; + } + } + + sourceSameRefRows.Add(srcEnumerator.Current); + srcCompleted = !srcEnumerator.MoveNext(); + + targetSameRefRows.Add(trgEnumerator.Current); + trgCompleted = !trgEnumerator.MoveNext(); + } + } + + while (!srcCompleted) + { + if (!AllTargetRows && srcEnumerator.Current.IsInRange) + { + rangeInfo.TextId = srcEnumerator.Current.TextId; + rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); + targetSameRefRows.Clear(); + if (rangeInfo.IsSourceEmpty) + rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; + rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); + } + else + { + foreach ( + ParallelTextRow row in CreateSourceRows(rangeInfo, srcEnumerator.Current, targetSameRefRows) + ) + { + yield return row; + } + } + srcCompleted = !srcEnumerator.MoveNext(); + } + + while (!trgCompleted) + { + if (!AllSourceRows && trgEnumerator.Current.IsInRange) + { + rangeInfo.TextId = trgEnumerator.Current.TextId; + rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); + sourceSameRefRows.Clear(); + if (rangeInfo.IsTargetEmpty) + rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; + rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); + } + else + { + foreach ( + ParallelTextRow row in CreateTargetRows(rangeInfo, trgEnumerator.Current, sourceSameRefRows) + ) + { + yield return row; + } + } + trgCompleted = !trgEnumerator.MoveNext(); + } + + if (rangeInfo.IsInRange) + yield return rangeInfo.CreateRow(); + } + } + + private IEnumerable CreateRows( + RangeInfo rangeInfo, + TextRow srcRow, + TextRow trgRow, + IReadOnlyCollection alignedWordPairs = null, + bool forceSourceInRange = false, + bool forceTargetInRange = false + ) + { + if (rangeInfo.IsInRange) + yield return rangeInfo.CreateRow(); + + string textId; + if (srcRow != null) + textId = srcRow.TextId; + else if (trgRow != null) + textId = trgRow.TextId; + else + throw new ArgumentNullException("Either a source or target must be specified."); + + object[] sourceRefs = srcRow != null ? new object[] { srcRow.Ref } : Array.Empty(); + object[] targetRefs = trgRow != null ? new object[] { trgRow.Ref } : Array.Empty(); + if (targetRefs.Length == 0 && TargetCorpus.IsScripture()) + { + targetRefs = sourceRefs + .Cast() + .Select(r => r.ChangeVersification(TargetCorpus.Versification)) + .Cast() + .ToArray(); + } + + TextRowFlags sourceFlags; + if (srcRow == null) + sourceFlags = forceSourceInRange ? TextRowFlags.InRange : TextRowFlags.None; + else + sourceFlags = srcRow.Flags; + + TextRowFlags targetFlags; + if (trgRow == null) + targetFlags = forceTargetInRange ? TextRowFlags.InRange : TextRowFlags.None; + else + targetFlags = trgRow.Flags; + + yield return new ParallelTextRow(textId, sourceRefs, targetRefs) + { + SourceSegment = srcRow != null ? srcRow.Segment : Array.Empty(), + TargetSegment = trgRow != null ? trgRow.Segment : Array.Empty(), + AlignedWordPairs = alignedWordPairs, + SourceFlags = sourceFlags, + TargetFlags = targetFlags + }; + } + + private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) + { + try + { + if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) + sameRefRows.Clear(); + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); + } + return sameRefRows.Count > 0; + } + + private IEnumerable CreateSourceRows( + RangeInfo rangeInfo, + TextRow sourceRow, + List targetSameRefRows, + bool forceTargetInRange = false + ) + { + if (CheckSameRefRows(targetSameRefRows, sourceRow)) + { + foreach (TextRow targetSameRefRow in targetSameRefRows) + { + foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceRow, targetSameRefRow)) + yield return row; + } + } + else if (AllSourceRows) + { + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + sourceRow, + null, + forceTargetInRange: forceTargetInRange + ) + ) + { + yield return row; + } + } + } + + private IEnumerable CreateTargetRows( + RangeInfo rangeInfo, + TextRow targetRow, + List sourceSameRefRows, + bool forceSourceInRange = false + ) + { + if (CheckSameRefRows(sourceSameRefRows, targetRow)) + { + foreach (TextRow sourceSameRefRow in sourceSameRefRows) + { + foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceSameRefRow, targetRow)) + yield return row; + } + } + else if (AllTargetRows) + { + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + null, + targetRow, + forceSourceInRange: forceSourceInRange + ) + ) + { + yield return row; + } + } + } + + private class RangeRow + { + public List Refs { get; } = new List(); + public List Segment { get; } = new List(); + public bool IsSentenceStart { get; set; } = false; + public bool IsInRange => Refs.Count > 0; + public bool IsEmpty => Segment.Count == 0; + } + + private class NRangeInfo + { + public int N = -1; + public string TextId { get; set; } = ""; + public ScrVers Versification { get; set; } = null; + public List Rows { get; } = new List(); + public bool IsInRange => Rows.Any(r => r.IsInRange); + + public NParallelTextRow CreateRow() + { + object[] refs = new object[0]; + foreach (RangeRow cRow in Rows) + { + if (refs.Count() == 0 && Versification != null) + { + refs = cRow + .Refs.ToArray() + .Cast() + .Select(r => r.ChangeVersification(Versification)) + .Cast() + .ToArray(); + } + } + var nParRow = new NParallelTextRow(TextId, Rows.Select(r => r.Refs).ToArray()) + { + Segments = Rows.Select(r => r.Segment.ToArray()).ToArray(), + Flags = Rows.Select(r => r.IsSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None) + .ToArray() + }; + TextId = ""; + foreach (RangeRow r in Rows) + { + r.Refs.Clear(); + r.Segment.Clear(); + r.IsSentenceStart = false; + } + return nParRow; + } + } + + private class DefaultRowRefComparer : IComparer + { + public int Compare(object x, object y) + { + // Do not use the default comparer for ScriptureRef, since we want to ignore segments + if (x is ScriptureRef sx && y is ScriptureRef sy) + return sx.CompareTo(sy, compareSegments: false); + + return Comparer.Default.Compare(x, y); + } + } + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs b/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs new file mode 100644 index 000000000..5487b2001 --- /dev/null +++ b/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs @@ -0,0 +1,36 @@ +using System.Collections; +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.Corpora +{ + public abstract class NParallelTextCorpusBase : INParallelTextCorpus + { + int ICorpus.Count(bool includeEmpty) + { + return Count(includeEmpty, null); + } + + public virtual int Count(bool includeEmpty = true, IEnumerable textIds = null) + { + return includeEmpty ? GetRows(textIds).Count() : GetRows(textIds).Count(r => !r.IsEmpty); + } + + public IEnumerable GetRows() + { + return GetRows(null); + } + + public abstract IEnumerable GetRows(IEnumerable textIds); + + public IEnumerator GetEnumerator() + { + return GetRows().GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs new file mode 100644 index 000000000..3035be330 --- /dev/null +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -0,0 +1,54 @@ +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using SIL.Extensions; + +namespace SIL.Machine.Corpora +{ + public class NParallelTextRow : IRow + { + public NParallelTextRow(string textId, IEnumerable> nRefs) + { + if (string.IsNullOrEmpty(textId)) + throw new ArgumentNullException(nameof(textId)); + + if (nRefs.SelectMany(r => r).Count() == 0) + throw new ArgumentNullException("Either a source or target ref must be provided."); + + TextId = textId; + NRefs = nRefs.ToList().ToReadOnlyList(); + N = NRefs.Count; + Segments = Enumerable.Range(0, N).Select(_ => Array.Empty()).ToImmutableArray(); + Flags = Enumerable.Range(0, N).Select(_ => TextRowFlags.SentenceStart).ToImmutableArray(); + } + + public string TextId { get; } + + public object Ref => NRefs.SelectMany(r => r).First(); + + public IReadOnlyList> NRefs { get; } + public int N { get; } + + public IReadOnlyList> Segments { get; set; } + public IReadOnlyList Flags { get; set; } + + public bool GetIsSentenceStart(int i) => + Flags.Count > i ? Flags[i].HasFlag(TextRowFlags.SentenceStart) : throw new ArgumentOutOfRangeException(); + + public bool GetIsInRange(int i) => + Flags.Count > i ? Flags[i].HasFlag(TextRowFlags.InRange) : throw new ArgumentOutOfRangeException(); + + public bool GetIsRangeStart(int i) => + Flags.Count > i ? Flags[i].HasFlag(TextRowFlags.RangeStart) : throw new ArgumentOutOfRangeException(); + + public bool IsEmpty => Segments.Any(s => s.Count == 0); + + public string GetText(int i) => string.Join(" ", Segments[i]); + + public NParallelTextRow Invert() + { + return new NParallelTextRow(TextId, NRefs.Reverse()) { Flags = Flags.Reverse().ToImmutableArray(), }; + } + } +} diff --git a/src/SIL.Machine/Corpora/ParallelCorpusEnumerator.cs b/src/SIL.Machine/Corpora/ParallelCorpusEnumerator.cs new file mode 100644 index 000000000..eed65e5a1 --- /dev/null +++ b/src/SIL.Machine/Corpora/ParallelCorpusEnumerator.cs @@ -0,0 +1,126 @@ +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using SIL.ObjectModel; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public class ParallelCorpusEnumerator : DisposableBase, IEnumerator + { + private readonly IEnumerator _enumerator; + private readonly bool _isScripture = false; + private readonly Queue _verseRows; + private readonly ScrVers _refVersification; + private TextRow _current; + private bool _isEnumerating = false; + private bool _enumeratorHasMoreData = true; + + public ParallelCorpusEnumerator( + IEnumerator enumerator, + ScrVers refVersification, + ScrVers versification + ) + { + _enumerator = enumerator; + _refVersification = refVersification; + _isScripture = refVersification != null && versification != null && refVersification != versification; + _verseRows = new Queue(); + } + + public TextRow Current => _current; + + object IEnumerator.Current => Current; + + public bool MoveNext() + { + if (_isScripture) + { + if (!_isEnumerating) + { + _enumerator.MoveNext(); + _isEnumerating = true; + } + if (_verseRows.Count == 0 && _enumerator.Current != null && _enumeratorHasMoreData) + CollectVerses(); + if (_verseRows.Count > 0) + { + _current = _verseRows.Dequeue(); + return true; + } + _current = null; + return false; + } + + _enumeratorHasMoreData = _enumerator.MoveNext(); + _current = _enumerator.Current; + return _enumeratorHasMoreData; + } + + public void Reset() + { + _enumerator.Reset(); + _isEnumerating = false; + _enumeratorHasMoreData = true; + } + + protected override void DisposeManagedResources() + { + _enumerator.Dispose(); + } + + private void CollectVerses() + { + var rowList = new List<(ScriptureRef Ref, TextRow Row)>(); + bool outOfOrder = false; + ScriptureRef prevScrRef = ScriptureRef.Empty; + int rangeStartOffset = -1; + do + { + TextRow row = _enumerator.Current; + var scrRef = (ScriptureRef)row.Ref; + if (!prevScrRef.IsEmpty && scrRef.BookNum != prevScrRef.BookNum) + break; + + scrRef = scrRef.ChangeVersification(_refVersification); + // convert one-to-many versification mapping to a verse range + if (scrRef.Equals(prevScrRef)) + { + (ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[ + rowList.Count + rangeStartOffset + ]; + TextRowFlags flags = TextRowFlags.InRange; + if (rangeStartRow.IsSentenceStart) + flags |= TextRowFlags.SentenceStart; + if (rangeStartOffset == -1 && (!rangeStartRow.IsInRange || rangeStartRow.IsRangeStart)) + flags |= TextRowFlags.RangeStart; + rowList[rowList.Count + rangeStartOffset] = ( + rangeStartVerseRef, + new TextRow(rangeStartRow.TextId, rangeStartRow.Ref) + { + Segment = rangeStartRow.Segment.Concat(row.Segment).ToArray(), + Flags = flags + } + ); + row = new TextRow(row.TextId, row.Ref) { Flags = TextRowFlags.InRange }; + rangeStartOffset--; + } + else + { + rangeStartOffset = -1; + } + rowList.Add((scrRef, row)); + if (!outOfOrder && scrRef.CompareTo(prevScrRef) < 0) + outOfOrder = true; + prevScrRef = scrRef; + _enumeratorHasMoreData = _enumerator.MoveNext(); + } while (_enumeratorHasMoreData); + + if (outOfOrder) + rowList.Sort((x, y) => x.Ref.CompareTo(y.Ref)); + + foreach ((ScriptureRef _, TextRow row) in rowList) + _verseRows.Enqueue(row); + } + } +} From eb08370c8a8b3f8a90d6842cc3696b6f65811e38 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 23 Oct 2024 15:04:57 -0400 Subject: [PATCH 02/26] more broken --- .../Corpora/NParallelTextCorpus.cs | 233 ++++++++++-------- src/SIL.Machine/Corpora/NParallelTextRow.cs | 20 +- src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 1 + ...sEnumerator.cs => TextCorpusEnumerator.cs} | 24 +- 4 files changed, 155 insertions(+), 123 deletions(-) rename src/SIL.Machine/Corpora/{ParallelCorpusEnumerator.cs => TextCorpusEnumerator.cs} (84%) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index e38d12383..4f461fc0e 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Collections.Immutable; using System.Linq; +using SIL.Extensions; using SIL.Linq; using SIL.ObjectModel; using SIL.Scripture; @@ -69,20 +70,13 @@ public override IEnumerable GetRows(IEnumerable textId { for (int i = 0; i < Corpora.Count; i++) { - if (i == 0) - { - enumeratedCorpora.Add(Corpora[0].GetRows(filterTextIds).GetEnumerator()); - } - else - { - enumeratedCorpora.Add( - new ParallelCorpusEnumerator( - Corpora[i].GetRows(filterTextIds).GetEnumerator(), - Corpora[0].Versification, - Corpora[i].Versification - ) - ); - } + enumeratedCorpora.Add( + new TextCorpusEnumerator( + Corpora[i].GetRows(filterTextIds).GetEnumerator(), + Corpora[0].Versification, + Corpora[i].Versification + ) + ); } return GetRows(enumeratedCorpora); } @@ -95,6 +89,12 @@ public override IEnumerable GetRows(IEnumerable textId } } + private bool AnyInRangeWithSegments(IList> listOfEnumerators) + { + return listOfEnumerators.Any(e => e.Current.IsInRange) + && listOfEnumerators.All(e => !(e.Current.IsInRange && e.Current.Segment.Count == 0)); + } + private IList MinRefIndexes(IList refs) { object minRef = refs[0]; @@ -115,18 +115,18 @@ private IList MinRefIndexes(IList refs) return minRefIndexes; } - private IEnumerable GetRows(IList> enumerators) + private IEnumerable GetRows(IList> listOfEnumerators) { { var rangeInfo = new NRangeInfo { Versification = Corpora[0].Versification }; List[] sameRefRows = new List[Corpora.Count]; - bool[] completed = enumerators.Select(e => !e.MoveNext()).ToArray(); + bool[] completed = listOfEnumerators.Select(e => !e.MoveNext()).ToArray(); while (!completed.Any()) { IList minRefIndexes; - IList currentRefs = enumerators.Select(e => e.Current.Ref).ToArray(); + IList currentRefs = listOfEnumerators.Select(e => e.Current.Ref).ToArray(); try { minRefIndexes = MinRefIndexes(currentRefs); @@ -135,34 +135,65 @@ private IEnumerable GetRows(IList> enumer { throw new CorpusAlignmentException(currentRefs.Select(r => r.ToString()).ToArray()); } - if (minRefIndexes.Count == N) + if (minRefIndexes.Count < N) { - if ( - (!AllTargetRows && srcEnumerator.Current.IsInRange) - || (!AllSourceRows && trgEnumerator.Current.IsInRange) - ) + IList nonMinRefIndexes = System.Linq.Enumerable.Range(0, N).Except(minRefIndexes).ToList(); + IReadOnlyList allNonMinRows = nonMinRefIndexes + .Select(i => AllRowsList[i]) + .ToImmutableArray(); + + IList> minEnumerators = minRefIndexes + .Select(i => listOfEnumerators[i]) + .ToList(); + IList> nonMinEnumerators = nonMinRefIndexes + .Select(i => listOfEnumerators[i]) + .ToList(); + + if (!allNonMinRows.Any() && minEnumerators.Select(e => e.Current.IsInRange).Any()) { } + // source is less than target + if (!AllTargetRows && srcEnumerator.Current.IsInRange) { if ( rangeInfo.IsInRange - && ( - ( - srcEnumerator.Current.IsInRange - && !trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - || ( - !srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - || ( - srcEnumerator.Current.IsInRange + && trgEnumerator.Current.IsInRange + && trgEnumerator.Current.Segment.Count > 0 + ) + { + yield return rangeInfo.CreateRow(); + } + rangeInfo.TextId = srcEnumerator.Current.TextId; + rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); + targetSameRefRows.Clear(); + if (rangeInfo.IsSourceEmpty) + rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; + rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); + } + else + { + foreach ( + ParallelTextRow row in CreateSourceRows( + rangeInfo, + srcEnumerator.Current, + targetSameRefRows, + forceTargetInRange: srcEnumerator.Current.TextId == trgEnumerator.Current.TextId + && !trgEnumerator.Current.IsRangeStart && trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - && trgEnumerator.Current.Segment.Count > 0 - ) ) ) + { + yield return row; + } + } + + sourceSameRefRows.Add(srcEnumerator.Current); + srcCompleted = !srcEnumerator.MoveNext(); + + if ( + (!AllTargetRows && srcEnumerator.Current.IsInRange) + || (!AllSourceRows && trgEnumerator.Current.IsInRange) + ) + { + if (rangeInfo.IsInRange && AnyInRangeWithSegments(listOfEnumerators)) { yield return rangeInfo.CreateRow(); } @@ -186,7 +217,7 @@ private IEnumerable GetRows(IList> enumer foreach (TextRow prevSourceRow in sourceSameRefRows) { foreach ( - ParallelTextRow row in CreateRows( + NParallelTextRow row in CreateRows( rangeInfo, prevSourceRow, trgEnumerator.Current @@ -203,7 +234,7 @@ ParallelTextRow row in CreateRows( foreach (TextRow prevTargetRow in targetSameRefRows) { foreach ( - ParallelTextRow row in CreateRows( + NParallelTextRow row in CreateRows( rangeInfo, srcEnumerator.Current, prevTargetRow @@ -216,7 +247,7 @@ ParallelTextRow row in CreateRows( } foreach ( - ParallelTextRow row in CreateRows( + NParallelTextRow row in CreateRows( rangeInfo, srcEnumerator.Current, trgEnumerator.Current, @@ -256,7 +287,7 @@ ParallelTextRow row in CreateRows( else { foreach ( - ParallelTextRow row in CreateSourceRows( + NParallelTextRow row in CreateSourceRows( rangeInfo, srcEnumerator.Current, targetSameRefRows, @@ -295,7 +326,7 @@ ParallelTextRow row in CreateSourceRows( else { foreach ( - ParallelTextRow row in CreateTargetRows( + NParallelTextRow row in CreateTargetRows( rangeInfo, trgEnumerator.Current, sourceSameRefRows, @@ -364,7 +395,7 @@ ParallelTextRow row in CreateTargetRows( foreach (TextRow prevSourceRow in sourceSameRefRows) { foreach ( - ParallelTextRow row in CreateRows( + NParallelTextRow row in CreateRows( rangeInfo, prevSourceRow, trgEnumerator.Current @@ -381,7 +412,7 @@ ParallelTextRow row in CreateRows( foreach (TextRow prevTargetRow in targetSameRefRows) { foreach ( - ParallelTextRow row in CreateRows( + NParallelTextRow row in CreateRows( rangeInfo, srcEnumerator.Current, prevTargetRow @@ -394,7 +425,7 @@ ParallelTextRow row in CreateRows( } foreach ( - ParallelTextRow row in CreateRows( + NParallelTextRow row in CreateRows( rangeInfo, srcEnumerator.Current, trgEnumerator.Current, @@ -428,7 +459,11 @@ ParallelTextRow row in CreateRows( else { foreach ( - ParallelTextRow row in CreateSourceRows(rangeInfo, srcEnumerator.Current, targetSameRefRows) + NParallelTextRow row in CreateSourceRows( + rangeInfo, + srcEnumerator.Current, + targetSameRefRows + ) ) { yield return row; @@ -451,7 +486,11 @@ ParallelTextRow row in CreateSourceRows(rangeInfo, srcEnumerator.Current, target else { foreach ( - ParallelTextRow row in CreateTargetRows(rangeInfo, trgEnumerator.Current, sourceSameRefRows) + NParallelTextRow row in CreateTargetRows( + rangeInfo, + trgEnumerator.Current, + sourceSameRefRows + ) ) { yield return row; @@ -465,56 +504,51 @@ ParallelTextRow row in CreateTargetRows(rangeInfo, trgEnumerator.Current, source } } - private IEnumerable CreateRows( - RangeInfo rangeInfo, - TextRow srcRow, - TextRow trgRow, - IReadOnlyCollection alignedWordPairs = null, - bool forceSourceInRange = false, - bool forceTargetInRange = false + private object[] UnifyVersification(object[] refs) + { + if (Corpora[0].Versification == null || refs.Length == 0) + return refs; + return refs.Cast() + .Select(r => r.ChangeVersification(Corpora[0].Versification)) + .Cast() + .ToArray(); + } + + private IEnumerable CreateRows( + NRangeInfo rangeInfo, + IList rows, + IList forceInRange = null ) { if (rangeInfo.IsInRange) yield return rangeInfo.CreateRow(); - string textId; - if (srcRow != null) - textId = srcRow.TextId; - else if (trgRow != null) - textId = trgRow.TextId; - else - throw new ArgumentNullException("Either a source or target must be specified."); - - object[] sourceRefs = srcRow != null ? new object[] { srcRow.Ref } : Array.Empty(); - object[] targetRefs = trgRow != null ? new object[] { trgRow.Ref } : Array.Empty(); - if (targetRefs.Length == 0 && TargetCorpus.IsScripture()) + if (!rows.Any(r => r != null)) + throw new ArgumentNullException("A corpus row must be specified."); + + object[] refRefs = new object[] { rows.Select(r => r?.Ref).First() }; + string textId = null; + IList refs = new List(); + IList flags = new List(); + for (int i = 0; i < rows.Count; i++) { - targetRefs = sourceRefs - .Cast() - .Select(r => r.ChangeVersification(TargetCorpus.Versification)) - .Cast() - .ToArray(); + if (rows[i] != null) + { + textId = textId ?? rows[i].TextId; + refs.Add(UnifyVersification(new object[] { rows[i].Ref })); + flags.Add(rows[i].Flags); + } + else + { + refs.Add(refRefs); + flags.Add(forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None); + } } - TextRowFlags sourceFlags; - if (srcRow == null) - sourceFlags = forceSourceInRange ? TextRowFlags.InRange : TextRowFlags.None; - else - sourceFlags = srcRow.Flags; - - TextRowFlags targetFlags; - if (trgRow == null) - targetFlags = forceTargetInRange ? TextRowFlags.InRange : TextRowFlags.None; - else - targetFlags = trgRow.Flags; - - yield return new ParallelTextRow(textId, sourceRefs, targetRefs) + yield return new NParallelTextRow(textId, refs) { - SourceSegment = srcRow != null ? srcRow.Segment : Array.Empty(), - TargetSegment = trgRow != null ? trgRow.Segment : Array.Empty(), - AlignedWordPairs = alignedWordPairs, - SourceFlags = sourceFlags, - TargetFlags = targetFlags + NSegments = rows.Select(r => r?.Segment ?? Array.Empty()).ToArray(), + NFlags = flags.ToReadOnlyList() }; } @@ -532,9 +566,10 @@ private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) return sameRefRows.Count > 0; } - private IEnumerable CreateSourceRows( - RangeInfo rangeInfo, + private IEnumerable CreateNRows( + NRangeInfo rangeInfo, TextRow sourceRow, + int index, List targetSameRefRows, bool forceTargetInRange = false ) @@ -543,14 +578,14 @@ private IEnumerable CreateSourceRows( { foreach (TextRow targetSameRefRow in targetSameRefRows) { - foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceRow, targetSameRefRow)) + foreach (NParallelTextRow row in CreateRows(rangeInfo, sourceRow, targetSameRefRow)) yield return row; } } else if (AllSourceRows) { foreach ( - ParallelTextRow row in CreateRows( + NParallelTextRow row in CreateRows( rangeInfo, sourceRow, null, @@ -563,8 +598,8 @@ ParallelTextRow row in CreateRows( } } - private IEnumerable CreateTargetRows( - RangeInfo rangeInfo, + private IEnumerable CreateTargetRows( + NRangeInfo rangeInfo, TextRow targetRow, List sourceSameRefRows, bool forceSourceInRange = false @@ -574,14 +609,14 @@ private IEnumerable CreateTargetRows( { foreach (TextRow sourceSameRefRow in sourceSameRefRows) { - foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceSameRefRow, targetRow)) + foreach (NParallelTextRow row in CreateRows(rangeInfo, sourceSameRefRow, targetRow)) yield return row; } } else if (AllTargetRows) { foreach ( - ParallelTextRow row in CreateRows( + NParallelTextRow row in CreateRows( rangeInfo, null, targetRow, @@ -628,8 +663,8 @@ public NParallelTextRow CreateRow() } var nParRow = new NParallelTextRow(TextId, Rows.Select(r => r.Refs).ToArray()) { - Segments = Rows.Select(r => r.Segment.ToArray()).ToArray(), - Flags = Rows.Select(r => r.IsSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None) + NSegments = Rows.Select(r => r.Segment.ToArray()).ToArray(), + NFlags = Rows.Select(r => r.IsSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None) .ToArray() }; TextId = ""; diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs index 3035be330..da478371b 100644 --- a/src/SIL.Machine/Corpora/NParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -19,8 +19,8 @@ public NParallelTextRow(string textId, IEnumerable> nRefs) TextId = textId; NRefs = nRefs.ToList().ToReadOnlyList(); N = NRefs.Count; - Segments = Enumerable.Range(0, N).Select(_ => Array.Empty()).ToImmutableArray(); - Flags = Enumerable.Range(0, N).Select(_ => TextRowFlags.SentenceStart).ToImmutableArray(); + NSegments = Enumerable.Range(0, N).Select(_ => Array.Empty()).ToImmutableArray(); + NFlags = Enumerable.Range(0, N).Select(_ => TextRowFlags.SentenceStart).ToImmutableArray(); } public string TextId { get; } @@ -30,25 +30,25 @@ public NParallelTextRow(string textId, IEnumerable> nRefs) public IReadOnlyList> NRefs { get; } public int N { get; } - public IReadOnlyList> Segments { get; set; } - public IReadOnlyList Flags { get; set; } + public IReadOnlyList> NSegments { get; set; } + public IReadOnlyList NFlags { get; set; } public bool GetIsSentenceStart(int i) => - Flags.Count > i ? Flags[i].HasFlag(TextRowFlags.SentenceStart) : throw new ArgumentOutOfRangeException(); + NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.SentenceStart) : throw new ArgumentOutOfRangeException(); public bool GetIsInRange(int i) => - Flags.Count > i ? Flags[i].HasFlag(TextRowFlags.InRange) : throw new ArgumentOutOfRangeException(); + NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.InRange) : throw new ArgumentOutOfRangeException(); public bool GetIsRangeStart(int i) => - Flags.Count > i ? Flags[i].HasFlag(TextRowFlags.RangeStart) : throw new ArgumentOutOfRangeException(); + NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.RangeStart) : throw new ArgumentOutOfRangeException(); - public bool IsEmpty => Segments.Any(s => s.Count == 0); + public bool IsEmpty => NSegments.Any(s => s.Count == 0); - public string GetText(int i) => string.Join(" ", Segments[i]); + public string GetText(int i) => string.Join(" ", NSegments[i]); public NParallelTextRow Invert() { - return new NParallelTextRow(TextId, NRefs.Reverse()) { Flags = Flags.Reverse().ToImmutableArray(), }; + return new NParallelTextRow(TextId, NRefs.Reverse()) { NFlags = NFlags.Reverse().ToImmutableArray(), }; } } } diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index 2f8a48847..edbadf286 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -95,6 +95,7 @@ public override IEnumerable GetRows(IEnumerable textIds } if (compare1 < 0) { + // source is less than target if (!AllTargetRows && srcEnumerator.Current.IsInRange) { if ( diff --git a/src/SIL.Machine/Corpora/ParallelCorpusEnumerator.cs b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs similarity index 84% rename from src/SIL.Machine/Corpora/ParallelCorpusEnumerator.cs rename to src/SIL.Machine/Corpora/TextCorpusEnumerator.cs index eed65e5a1..592bfcc61 100644 --- a/src/SIL.Machine/Corpora/ParallelCorpusEnumerator.cs +++ b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs @@ -6,7 +6,7 @@ namespace SIL.Machine.Corpora { - public class ParallelCorpusEnumerator : DisposableBase, IEnumerator + public class TextCorpusEnumerator : DisposableBase, IEnumerator { private readonly IEnumerator _enumerator; private readonly bool _isScripture = false; @@ -16,11 +16,7 @@ public class ParallelCorpusEnumerator : DisposableBase, IEnumerator private bool _isEnumerating = false; private bool _enumeratorHasMoreData = true; - public ParallelCorpusEnumerator( - IEnumerator enumerator, - ScrVers refVersification, - ScrVers versification - ) + public TextCorpusEnumerator(IEnumerator enumerator, ScrVers refVersification, ScrVers versification) { _enumerator = enumerator; _refVersification = refVersification; @@ -73,18 +69,18 @@ private void CollectVerses() { var rowList = new List<(ScriptureRef Ref, TextRow Row)>(); bool outOfOrder = false; - ScriptureRef prevScrRef = ScriptureRef.Empty; + ScriptureRef prevRefRef = ScriptureRef.Empty; int rangeStartOffset = -1; do { TextRow row = _enumerator.Current; - var scrRef = (ScriptureRef)row.Ref; - if (!prevScrRef.IsEmpty && scrRef.BookNum != prevScrRef.BookNum) + var refRef = (ScriptureRef)row.Ref; + if (!prevRefRef.IsEmpty && refRef.BookNum != prevRefRef.BookNum) break; - scrRef = scrRef.ChangeVersification(_refVersification); + refRef = refRef.ChangeVersification(_refVersification); // convert one-to-many versification mapping to a verse range - if (scrRef.Equals(prevScrRef)) + if (refRef.Equals(prevRefRef)) { (ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[ rowList.Count + rangeStartOffset @@ -109,10 +105,10 @@ private void CollectVerses() { rangeStartOffset = -1; } - rowList.Add((scrRef, row)); - if (!outOfOrder && scrRef.CompareTo(prevScrRef) < 0) + rowList.Add((refRef, row)); + if (!outOfOrder && refRef.CompareTo(prevRefRef) < 0) outOfOrder = true; - prevScrRef = scrRef; + prevRefRef = refRef; _enumeratorHasMoreData = _enumerator.MoveNext(); } while (_enumeratorHasMoreData); From fd763543ae7f82d88ca732c5bbcabd7e63c9440b Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 23 Oct 2024 17:09:10 -0400 Subject: [PATCH 03/26] More broken. --- .../Corpora/NParallelTextCorpus.cs | 82 +++++++++++++++++-- 1 file changed, 74 insertions(+), 8 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index 4f461fc0e..0530d0cbe 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -149,7 +149,39 @@ private IEnumerable GetRows(IList> listOf .Select(i => listOfEnumerators[i]) .ToList(); - if (!allNonMinRows.Any() && minEnumerators.Select(e => e.Current.IsInRange).Any()) { } + if (!allNonMinRows.Any() && minEnumerators.Select(e => e.Current.IsInRange).Any()) + { + if ( + rangeInfo.IsInRange + && nonMinEnumerators + .Select(e => e.Current.IsInRange && e.Current.Segment.Count > 0) + .Any() + ) + { + yield return rangeInfo.CreateRow(); + } + minRefIndexes.ForEach(i => rangeInfo.AddTextRow(listOfEnumerators[i].Current, i)); + nonMinRefIndexes.ForEach(i => rangeInfo.Rows[i].SameRefRows.Clear()); + } + else + { + foreach ( + NParallelTextRow row in CreateMinRefRows( + rangeInfo, + minEnumerators.Select(e => e.Current).ToList(), + nonMinEnumerators.Select(e => e.Current).ToList(), + allNonMinRows + ) + ) + { + yield return row; + } + foreach (int i in nonMinRefIndexes) + { + rangeInfo.Rows[i].SameRefRows.Add(listOfEnumerators[i].Current); + listOfEnumerators[i].MoveNext(); + } + } // source is less than target if (!AllTargetRows && srcEnumerator.Current.IsInRange) { @@ -566,14 +598,17 @@ private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) return sameRefRows.Count > 0; } - private IEnumerable CreateNRows( + private IEnumerable CreateMinRefRows( NRangeInfo rangeInfo, - TextRow sourceRow, - int index, - List targetSameRefRows, - bool forceTargetInRange = false + IList currentRows, + IList minRefIndexes, + IList nonMinRefIndexes, + bool forceInRange = false ) { + IList minRows = minRefIndexes.Select(i => currentRows[i]).ToList(); + IList nonMinRows = nonMinRefIndexes.Select(i => currentRows[i]).ToList(); + if (CheckSameRefRows(targetSameRefRows, sourceRow)) { foreach (TextRow targetSameRefRow in targetSameRefRows) @@ -631,8 +666,9 @@ NParallelTextRow row in CreateRows( private class RangeRow { - public List Refs { get; } = new List(); - public List Segment { get; } = new List(); + public IList Refs { get; } = new List(); + public IList Segment { get; } = new List(); + public IList SameRefRows { get; } = new List(); public bool IsSentenceStart { get; set; } = false; public bool IsInRange => Refs.Count > 0; public bool IsEmpty => Segment.Count == 0; @@ -643,9 +679,39 @@ private class NRangeInfo public int N = -1; public string TextId { get; set; } = ""; public ScrVers Versification { get; set; } = null; + public IComparer RowRefComparer { get; set; } = null; public List Rows { get; } = new List(); public bool IsInRange => Rows.Any(r => r.IsInRange); + private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) + { + try + { + if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) + sameRefRows.Clear(); + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); + } + return sameRefRows.Count > 0; + } + + public void AddTextRow(TextRow row, int index) + { + if (N <= row.Segment.Count) + { + throw new ArgumentOutOfRangeException( + $"There are only {N} parallel texts, but text {index} was chosen." + ); + } + TextId = row.TextId; + Rows[index].Refs.Add(row.Ref); + if (Rows[index].IsEmpty) + Rows[index].IsSentenceStart = row.IsSentenceStart; + Rows[index].Segment.AddRange(row.Segment); + } + public NParallelTextRow CreateRow() { object[] refs = new object[0]; From 266aa90afb9143cb100fd4b6b85ffa81e050e784 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 25 Oct 2024 13:06:26 -0400 Subject: [PATCH 04/26] Compiling but not working --- .../Corpora/NParallelTextCorpus.cs | 481 +++--------------- .../Corpora/TextCorpusEnumerator.cs | 14 +- 2 files changed, 89 insertions(+), 406 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index 0530d0cbe..dc2b4b6ec 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -1,11 +1,9 @@ using System; -using System.Collections; using System.Collections.Generic; using System.Collections.Immutable; using System.Linq; using SIL.Extensions; using SIL.Linq; -using SIL.ObjectModel; using SIL.Scripture; namespace SIL.Machine.Corpora @@ -89,10 +87,9 @@ public override IEnumerable GetRows(IEnumerable textId } } - private bool AnyInRangeWithSegments(IList> listOfEnumerators) + private bool AnyInRangeWithSegments(IList rows) { - return listOfEnumerators.Any(e => e.Current.IsInRange) - && listOfEnumerators.All(e => !(e.Current.IsInRange && e.Current.Segment.Count == 0)); + return rows.Any(r => r.IsInRange) && rows.All(r => !(r.IsInRange && r.Segment.Count == 0)); } private IList MinRefIndexes(IList refs) @@ -123,19 +120,23 @@ private IEnumerable GetRows(IList> listOf List[] sameRefRows = new List[Corpora.Count]; bool[] completed = listOfEnumerators.Select(e => !e.MoveNext()).ToArray(); - while (!completed.Any()) + while (!completed.All(c => c)) { IList minRefIndexes; - IList currentRefs = listOfEnumerators.Select(e => e.Current.Ref).ToArray(); + IList currentRows = listOfEnumerators + .Where((e, i) => !completed[i]) + .Select(e => e.Current) + .ToArray(); try { - minRefIndexes = MinRefIndexes(currentRefs); + minRefIndexes = MinRefIndexes(currentRows.Select(e => e.Ref).ToArray()); } catch (ArgumentException) { - throw new CorpusAlignmentException(currentRefs.Select(r => r.ToString()).ToArray()); + throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); } - if (minRefIndexes.Count < N) + + if (minRefIndexes.Count < (N - completed.Count(c => c))) //then there are some non-min refs { IList nonMinRefIndexes = System.Linq.Enumerable.Range(0, N).Except(minRefIndexes).ToList(); IReadOnlyList allNonMinRows = nonMinRefIndexes @@ -169,8 +170,15 @@ private IEnumerable GetRows(IList> listOf NParallelTextRow row in CreateMinRefRows( rangeInfo, minEnumerators.Select(e => e.Current).ToList(), - nonMinEnumerators.Select(e => e.Current).ToList(), - allNonMinRows + nonMinRefIndexes, + forceInRange: minEnumerators + .Select(e => e.Current.TextId) + .Union(nonMinEnumerators.Select(e => e.Current.TextId)) + .Distinct() + .Count() == 1 + && nonMinEnumerators + .Select(e => !e.Current.IsRangeStart && e.Current.IsInRange) + .Any() ) ) { @@ -182,353 +190,60 @@ NParallelTextRow row in CreateMinRefRows( listOfEnumerators[i].MoveNext(); } } - // source is less than target - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateSourceRows( - rangeInfo, - srcEnumerator.Current, - targetSameRefRows, - forceTargetInRange: srcEnumerator.Current.TextId == trgEnumerator.Current.TextId - && !trgEnumerator.Current.IsRangeStart - && trgEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - - if ( - (!AllTargetRows && srcEnumerator.Current.IsInRange) - || (!AllSourceRows && trgEnumerator.Current.IsInRange) - ) - { - if (rangeInfo.IsInRange && AnyInRangeWithSegments(listOfEnumerators)) - { - yield return rangeInfo.CreateRow(); - } - - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) - { - foreach (TextRow prevSourceRow in sourceSameRefRows) - { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - prevSourceRow, - trgEnumerator.Current - ) - ) - { - yield return row; - } - } - } - - if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) - { - foreach (TextRow prevTargetRow in targetSameRefRows) - { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - prevTargetRow - ) - ) - { - yield return row; - } - } - } - - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - trgEnumerator.Current, - compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); - } - if (compare < 0) - { - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - NParallelTextRow row in CreateSourceRows( - rangeInfo, - srcEnumerator.Current, - targetSameRefRows, - forceTargetInRange: srcEnumerator.Current.TextId == trgEnumerator.Current.TextId - && !trgEnumerator.Current.IsRangeStart - && trgEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - } - else if (compare > 0) - { - if (!AllSourceRows && trgEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && srcEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - foreach ( - NParallelTextRow row in CreateTargetRows( - rangeInfo, - trgEnumerator.Current, - sourceSameRefRows, - forceSourceInRange: trgEnumerator.Current.TextId == srcEnumerator.Current.TextId - && !srcEnumerator.Current.IsRangeStart - && srcEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); } - else - // compare == 0 - the refs are the same + else if (minRefIndexes.Count == (N - completed.Count(c => c))) + // the refs are all the same { if ( - (!AllTargetRows && srcEnumerator.Current.IsInRange) - || (!AllSourceRows && trgEnumerator.Current.IsInRange) + !currentRows.Select((r, i) => AllRowsList[i]).Any() + && currentRows.Select(r => r.IsInRange).Any() ) { - if ( - rangeInfo.IsInRange - && ( - ( - srcEnumerator.Current.IsInRange - && !trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - || ( - !srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - || ( - srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - && trgEnumerator.Current.Segment.Count > 0 - ) - ) - ) + if (rangeInfo.IsInRange && AnyInRangeWithSegments(currentRows)) { yield return rangeInfo.CreateRow(); } - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); + for (int i = 0; i < currentRows.Count; i++) + { + rangeInfo.AddTextRow(currentRows[i], i); + rangeInfo.Rows[i].SameRefRows.Clear(); + } } else { - if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) + foreach (var row in currentRows) //TODO walk through together { - foreach (TextRow prevSourceRow in sourceSameRefRows) + if (rangeInfo.CheckSameRefRows(row)) { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - prevSourceRow, - trgEnumerator.Current - ) - ) + foreach (TextRow tr in rangeInfo.Rows.SelectMany(r => r.SameRefRows)) { - yield return row; - } - } - } - - if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) - { - foreach (TextRow prevTargetRow in targetSameRefRows) - { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - prevTargetRow + foreach ( + NParallelTextRow r in CreateRows(rangeInfo, new List { tr, row }) ) - ) - { - yield return row; + { + yield return r; + } } } } - - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - trgEnumerator.Current, - compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null - ) - ) + foreach (NParallelTextRow row in CreateRows(rangeInfo, currentRows)) { yield return row; } } - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); - } - } - - while (!srcCompleted) - { - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - NParallelTextRow row in CreateSourceRows( - rangeInfo, - srcEnumerator.Current, - targetSameRefRows - ) - ) + for (int i = 0; i < currentRows.Count; i++) { - yield return row; + rangeInfo.Rows[i].SameRefRows.Add(currentRows[i]); } } - srcCompleted = !srcEnumerator.MoveNext(); - } - - while (!trgCompleted) - { - if (!AllSourceRows && trgEnumerator.Current.IsInRange) - { - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } else { - foreach ( - NParallelTextRow row in CreateTargetRows( - rangeInfo, - trgEnumerator.Current, - sourceSameRefRows - ) - ) - { - yield return row; - } + throw new CorpusAlignmentException( + minRefIndexes.Select(i => currentRows[i].Ref.ToString()).ToArray() + ); } - trgCompleted = !trgEnumerator.MoveNext(); } if (rangeInfo.IsInRange) @@ -573,7 +288,7 @@ private IEnumerable CreateRows( else { refs.Add(refRefs); - flags.Add(forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None); + flags.Add(forceInRange == null || !forceInRange[i] ? TextRowFlags.None : TextRowFlags.InRange); } } @@ -584,82 +299,35 @@ private IEnumerable CreateRows( }; } - private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) - { - try - { - if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) - sameRefRows.Clear(); - } - catch (ArgumentException) - { - throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); - } - return sameRefRows.Count > 0; - } - private IEnumerable CreateMinRefRows( NRangeInfo rangeInfo, - IList currentRows, - IList minRefIndexes, + IList minRefRows, IList nonMinRefIndexes, bool forceInRange = false ) { - IList minRows = minRefIndexes.Select(i => currentRows[i]).ToList(); - IList nonMinRows = nonMinRefIndexes.Select(i => currentRows[i]).ToList(); - - if (CheckSameRefRows(targetSameRefRows, sourceRow)) - { - foreach (TextRow targetSameRefRow in targetSameRefRows) - { - foreach (NParallelTextRow row in CreateRows(rangeInfo, sourceRow, targetSameRefRow)) - yield return row; - } - } - else if (AllSourceRows) - { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - sourceRow, - null, - forceTargetInRange: forceTargetInRange - ) - ) - { - yield return row; - } - } - } + List sameRefRows = rangeInfo + .Rows.Where((r, i) => nonMinRefIndexes.Contains(i)) + .SelectMany(r => r.SameRefRows) + .ToList(); - private IEnumerable CreateTargetRows( - NRangeInfo rangeInfo, - TextRow targetRow, - List sourceSameRefRows, - bool forceSourceInRange = false - ) - { - if (CheckSameRefRows(sourceSameRefRows, targetRow)) + foreach (TextRow textRow in minRefRows) { - foreach (TextRow sourceSameRefRow in sourceSameRefRows) + if (rangeInfo.CheckSameRefRows(sameRefRows, textRow)) { - foreach (NParallelTextRow row in CreateRows(rangeInfo, sourceSameRefRow, targetRow)) - yield return row; - } - } - else if (AllTargetRows) - { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - null, - targetRow, - forceSourceInRange: forceSourceInRange - ) - ) - { - yield return row; + foreach (TextRow sameRefRow in sameRefRows) + { + foreach ( + NParallelTextRow row in CreateRows( + rangeInfo, + new List() { textRow, sameRefRow }, + forceInRange: new List() { false, forceInRange } + ) + ) + { + yield return row; + } + } } } } @@ -683,7 +351,7 @@ private class NRangeInfo public List Rows { get; } = new List(); public bool IsInRange => Rows.Any(r => r.IsInRange); - private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) + public bool CheckSameRefRows(IList sameRefRows, TextRow otherRow) { try { @@ -697,6 +365,21 @@ private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) return sameRefRows.Count > 0; } + public bool CheckSameRefRows(TextRow row) + { + var sameRefRows = Rows.SelectMany(r => r.SameRefRows).ToList(); + try + { + if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, row.Ref) != 0) + sameRefRows.Clear(); + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), row.Ref.ToString()); + } + return sameRefRows.Count > 0; + } + public void AddTextRow(TextRow row, int index) { if (N <= row.Segment.Count) @@ -727,7 +410,7 @@ public NParallelTextRow CreateRow() .ToArray(); } } - var nParRow = new NParallelTextRow(TextId, Rows.Select(r => r.Refs).ToArray()) + var nParRow = new NParallelTextRow(TextId, Rows.Select(r => r.Refs.ToList()).ToArray()) { NSegments = Rows.Select(r => r.Segment.ToArray()).ToArray(), NFlags = Rows.Select(r => r.IsSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None) diff --git a/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs index 592bfcc61..a0fed87b0 100644 --- a/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs +++ b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs @@ -74,13 +74,13 @@ private void CollectVerses() do { TextRow row = _enumerator.Current; - var refRef = (ScriptureRef)row.Ref; - if (!prevRefRef.IsEmpty && refRef.BookNum != prevRefRef.BookNum) + var scrRef = (ScriptureRef)row.Ref; + if (!prevRefRef.IsEmpty && scrRef.BookNum != prevRefRef.BookNum) break; - refRef = refRef.ChangeVersification(_refVersification); + scrRef = scrRef.ChangeVersification(_refVersification); // convert one-to-many versification mapping to a verse range - if (refRef.Equals(prevRefRef)) + if (scrRef.Equals(prevRefRef)) { (ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[ rowList.Count + rangeStartOffset @@ -105,10 +105,10 @@ private void CollectVerses() { rangeStartOffset = -1; } - rowList.Add((refRef, row)); - if (!outOfOrder && refRef.CompareTo(prevRefRef) < 0) + rowList.Add((scrRef, row)); + if (!outOfOrder && scrRef.CompareTo(prevRefRef) < 0) outOfOrder = true; - prevRefRef = refRef; + prevRefRef = scrRef; _enumeratorHasMoreData = _enumerator.MoveNext(); } while (_enumeratorHasMoreData); From e76177c47a25723399943d9e348b2414994a20f5 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 30 Oct 2024 17:19:25 -0400 Subject: [PATCH 05/26] Progress --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 76 +++ .../Corpora/NParallelTextCorpus.cs | 152 +++-- src/SIL.Machine/Corpora/NParallelTextRow.cs | 2 +- src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 524 ++++++++++-------- .../Corpora/TextCorpusEnumerator.cs | 14 +- .../Corpora/ParallelTextCorpusTests.cs | 32 +- 6 files changed, 486 insertions(+), 314 deletions(-) diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index 7d974366e..b2247a972 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -367,6 +367,16 @@ public static ITextCorpus FilterTexts(this ITextCorpus corpus, IEnumerable GetRows(IEnumerable textIds) } } + private enum MergeRule + { + First = 1, + Random = 2 + } + + private class MergedCorpus : TextCorpusBase + { + private readonly NParallelTextCorpus _corpus; + + private readonly MergeRule _mergeRule; + + private readonly Random _random; + + private readonly int _seed; + + public MergedCorpus(NParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule, int seed) + { + _corpus = nParallelTextCorpus; + _mergeRule = mergeRule; + _seed = seed; + _random = new Random(_seed); + } + + public override IEnumerable Texts => _corpus.Corpora.SelectMany(c => c.Texts); + + public override bool IsTokenized => + Enumerable.Range(0, _corpus.N).Select(i => _corpus.GetIsTokenized(i)).All(b => b); + + public override ScrVers Versification => _corpus.N > 0 ? _corpus.Corpora.First().Versification : null; + + public override IEnumerable GetRows(IEnumerable textIds) + { + foreach (NParallelTextRow nRow in _corpus.GetRows()) + { + if (nRow.N == 0 || nRow.IsEmpty) + continue; + IReadOnlyList nonEmptyIndices = nRow + .NSegments.Select((s, i) => (s, i)) + .Where(pair => pair.s.Count > 0) + .Select(pair => pair.i) + .ToList(); + IReadOnlyList indices = + nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList(); + switch (_mergeRule) + { + case MergeRule.First: + yield return new TextRow(nRow.TextId, nRow.NRefs[indices.First()]) + { + Segment = nRow.NSegments[indices.First()], + Flags = nRow.NFlags[indices.First()] + }; + break; + case MergeRule.Random: + int i = _random.Next(0, indices.Count); + yield return new TextRow(nRow.TextId, nRow.NRefs[i]) + { + Segment = nRow.NSegments[i], + Flags = nRow.NFlags[i] + }; + break; + } + } + } + } + #endregion #region IAlignmentCorpus operations diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index dc2b4b6ec..de325b311 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -64,19 +64,17 @@ public override IEnumerable GetRows(IEnumerable textId filterTextIds.IntersectWith(textIds); IList> enumeratedCorpora = new List>(); + IEnumerable ret = new List() { }; try { for (int i = 0; i < Corpora.Count; i++) { + var enumerator = Corpora[i].GetRows(filterTextIds).GetEnumerator(); enumeratedCorpora.Add( - new TextCorpusEnumerator( - Corpora[i].GetRows(filterTextIds).GetEnumerator(), - Corpora[0].Versification, - Corpora[i].Versification - ) + new TextCorpusEnumerator(enumerator, Corpora[0].Versification, Corpora[i].Versification) ); } - return GetRows(enumeratedCorpora); + ret = GetRows(enumeratedCorpora).ToList(); //TODO cleanup } finally { @@ -85,6 +83,7 @@ public override IEnumerable GetRows(IEnumerable textId enumerator.Dispose(); } } + return ret; } private bool AnyInRangeWithSegments(IList rows) @@ -95,7 +94,7 @@ private bool AnyInRangeWithSegments(IList rows) private IList MinRefIndexes(IList refs) { object minRef = refs[0]; - IList minRefIndexes = new List(0); + IList minRefIndexes = new List() { 0 }; for (int i = 1; i < refs.Count; i++) { if (RowRefComparer.Compare(refs[i], minRef) < 0) @@ -115,7 +114,11 @@ private IList MinRefIndexes(IList refs) private IEnumerable GetRows(IList> listOfEnumerators) { { - var rangeInfo = new NRangeInfo { Versification = Corpora[0].Versification }; + var rangeInfo = new NRangeInfo(N) + { + Versification = Corpora[0].Versification, + RowRefComparer = RowRefComparer + }; List[] sameRefRows = new List[Corpora.Count]; bool[] completed = listOfEnumerators.Select(e => !e.MoveNext()).ToArray(); @@ -123,22 +126,29 @@ private IEnumerable GetRows(IList> listOf while (!completed.All(c => c)) { IList minRefIndexes; - IList currentRows = listOfEnumerators - .Where((e, i) => !completed[i]) - .Select(e => e.Current) - .ToArray(); + IList currentRows = listOfEnumerators.Select(e => e.Current).ToArray(); try { - minRefIndexes = MinRefIndexes(currentRows.Select(e => e.Ref).ToArray()); + minRefIndexes = MinRefIndexes( + currentRows + .Select(e => + { + if (e != null) + return e.Ref; + return null; + }) + .ToArray() + ); } catch (ArgumentException) { throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); } + var currentIncompleteRows = currentRows.Where((r, i) => !completed[i]).ToArray(); + IList nonMinRefIndexes = System.Linq.Enumerable.Range(0, N).Except(minRefIndexes).ToList(); if (minRefIndexes.Count < (N - completed.Count(c => c))) //then there are some non-min refs { - IList nonMinRefIndexes = System.Linq.Enumerable.Range(0, N).Except(minRefIndexes).ToList(); IReadOnlyList allNonMinRows = nonMinRefIndexes .Select(i => AllRowsList[i]) .ToImmutableArray(); @@ -170,6 +180,7 @@ private IEnumerable GetRows(IList> listOf NParallelTextRow row in CreateMinRefRows( rangeInfo, minEnumerators.Select(e => e.Current).ToList(), + minEnumerators.Where((e, i) => AllRowsList[i]).Select(e => e.Current).ToList(), nonMinRefIndexes, forceInRange: minEnumerators .Select(e => e.Current.TextId) @@ -184,27 +195,33 @@ NParallelTextRow row in CreateMinRefRows( { yield return row; } - foreach (int i in nonMinRefIndexes) - { - rangeInfo.Rows[i].SameRefRows.Add(listOfEnumerators[i].Current); - listOfEnumerators[i].MoveNext(); - } + } + foreach (int i in minRefIndexes) + { + rangeInfo.Rows[i].SameRefRows.Add(listOfEnumerators[i].Current); + completed[i] = !listOfEnumerators[i].MoveNext(); } } else if (minRefIndexes.Count == (N - completed.Count(c => c))) // the refs are all the same { if ( - !currentRows.Select((r, i) => AllRowsList[i]).Any() - && currentRows.Select(r => r.IsInRange).Any() + minRefIndexes + .Select(i => + !AllRowsList[i] + && minRefIndexes + .Select(j => j != i && !completed[i] && listOfEnumerators[i].Current.IsInRange) + .Any(b => b) + ) + .Any(b => b) ) { - if (rangeInfo.IsInRange && AnyInRangeWithSegments(currentRows)) + if (rangeInfo.IsInRange && AnyInRangeWithSegments(currentIncompleteRows)) { yield return rangeInfo.CreateRow(); } - for (int i = 0; i < currentRows.Count; i++) + for (int i = 0; i < rangeInfo.Rows.Count; i++) { rangeInfo.AddTextRow(currentRows[i], i); rangeInfo.Rows[i].SameRefRows.Clear(); @@ -212,30 +229,47 @@ NParallelTextRow row in CreateMinRefRows( } else { - foreach (var row in currentRows) //TODO walk through together + for (int i = 0; i < rangeInfo.Rows.Count - 1; i++) { - if (rangeInfo.CheckSameRefRows(row)) + for (int j = 0; j < rangeInfo.Rows.Count; j++) { - foreach (TextRow tr in rangeInfo.Rows.SelectMany(r => r.SameRefRows)) + if (j <= i || completed[i] || completed[j]) + continue; + + if (rangeInfo.CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) { - foreach ( - NParallelTextRow r in CreateRows(rangeInfo, new List { tr, row }) - ) + foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows) { - yield return r; + foreach ( + NParallelTextRow r in CreateRows( + rangeInfo, + rangeInfo.Rows[i].IsInRange, + new List { tr, currentRows[i] } + ) + ) + { + yield return r; + } } } } } - foreach (NParallelTextRow row in CreateRows(rangeInfo, currentRows)) + foreach ( + NParallelTextRow row in CreateRows( + rangeInfo, + rangeInfo.IsInRange, + currentIncompleteRows + ) + ) { yield return row; } } - for (int i = 0; i < currentRows.Count; i++) + for (int i = 0; i < rangeInfo.Rows.Count; i++) { rangeInfo.Rows[i].SameRefRows.Add(currentRows[i]); + completed[i] = !listOfEnumerators[i].MoveNext(); } } else @@ -246,7 +280,7 @@ NParallelTextRow row in CreateMinRefRows( } } - if (rangeInfo.IsInRange) + if (rangeInfo.IsInRange) //TODO yield return rangeInfo.CreateRow(); } } @@ -263,14 +297,15 @@ private object[] UnifyVersification(object[] refs) private IEnumerable CreateRows( NRangeInfo rangeInfo, + bool isInRange, IList rows, IList forceInRange = null ) { - if (rangeInfo.IsInRange) + if (isInRange) yield return rangeInfo.CreateRow(); - if (!rows.Any(r => r != null)) + if (rows.All(r => r == null)) throw new ArgumentNullException("A corpus row must be specified."); object[] refRefs = new object[] { rows.Select(r => r?.Ref).First() }; @@ -302,6 +337,7 @@ private IEnumerable CreateRows( private IEnumerable CreateMinRefRows( NRangeInfo rangeInfo, IList minRefRows, + IList allRowsMinRefRows, IList nonMinRefIndexes, bool forceInRange = false ) @@ -320,6 +356,7 @@ private IEnumerable CreateMinRefRows( foreach ( NParallelTextRow row in CreateRows( rangeInfo, + rangeInfo.IsInRange, new List() { textRow, sameRefRow }, forceInRange: new List() { false, forceInRange } ) @@ -330,6 +367,20 @@ NParallelTextRow row in CreateRows( } } } + foreach (TextRow textRow in allRowsMinRefRows) + { + foreach ( + NParallelTextRow row in CreateRows( + rangeInfo, + textRow.IsInRange, + new List { textRow }, //TODO empty not non-existent + new List { forceInRange } + ) + ) + { + yield return row; + } + } } private class RangeRow @@ -344,45 +395,40 @@ private class RangeRow private class NRangeInfo { - public int N = -1; + public int N; public string TextId { get; set; } = ""; public ScrVers Versification { get; set; } = null; public IComparer RowRefComparer { get; set; } = null; - public List Rows { get; } = new List(); + public List Rows { get; } public bool IsInRange => Rows.Any(r => r.IsInRange); - public bool CheckSameRefRows(IList sameRefRows, TextRow otherRow) + public NRangeInfo(int n) { - try + N = n; + Rows = new List(); + for (int i = 0; i < N; i++) { - if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) - sameRefRows.Clear(); + Rows.Add(new RangeRow()); } - catch (ArgumentException) - { - throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); - } - return sameRefRows.Count > 0; } - public bool CheckSameRefRows(TextRow row) + public bool CheckSameRefRows(IList sameRefRows, TextRow otherRow) { - var sameRefRows = Rows.SelectMany(r => r.SameRefRows).ToList(); try { - if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, row.Ref) != 0) + if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) sameRefRows.Clear(); } catch (ArgumentException) { - throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), row.Ref.ToString()); + throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); } return sameRefRows.Count > 0; } public void AddTextRow(TextRow row, int index) { - if (N <= row.Segment.Count) + if (N <= index) { throw new ArgumentOutOfRangeException( $"There are only {N} parallel texts, but text {index} was chosen." @@ -434,6 +480,10 @@ public int Compare(object x, object y) // Do not use the default comparer for ScriptureRef, since we want to ignore segments if (x is ScriptureRef sx && y is ScriptureRef sy) return sx.CompareTo(sy, compareSegments: false); + if (x == null && y != null) + return 1; + if (x != null && y == null) + return -1; return Comparer.Default.Compare(x, y); } diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs index da478371b..146ba6009 100644 --- a/src/SIL.Machine/Corpora/NParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -42,7 +42,7 @@ public bool GetIsInRange(int i) => public bool GetIsRangeStart(int i) => NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.RangeStart) : throw new ArgumentOutOfRangeException(); - public bool IsEmpty => NSegments.Any(s => s.Count == 0); + public bool IsEmpty => NSegments.All(s => s.Count == 0); public string GetText(int i) => string.Join(" ", NSegments[i]); diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index edbadf286..9b9f668e3 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -20,6 +20,7 @@ public ParallelTextCorpus( TargetCorpus = targetCorpus; AlignmentCorpus = alignmentCorpus ?? new DictionaryAlignmentCorpus(); RowRefComparer = rowRefComparer ?? new DefaultRowRefComparer(); + NParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }); } public override bool IsSourceTokenized => SourceCorpus.IsTokenized; @@ -30,315 +31,356 @@ public ParallelTextCorpus( public ITextCorpus SourceCorpus { get; } public ITextCorpus TargetCorpus { get; } + + public NParallelTextCorpus NParallelTextCorpus { get; set; } public IAlignmentCorpus AlignmentCorpus { get; } public IComparer RowRefComparer { get; } public override IEnumerable GetRows(IEnumerable textIds) { - IEnumerable sourceTextIds = SourceCorpus.Texts.Select(t => t.Id); - IEnumerable targetTextIds = TargetCorpus.Texts.Select(t => t.Id); - - HashSet filterTextIds; - if (AllSourceRows && AllTargetRows) - { - filterTextIds = new HashSet(sourceTextIds); - filterTextIds.UnionWith(targetTextIds); - } - else if (!AllSourceRows && !AllTargetRows) - { - filterTextIds = new HashSet(sourceTextIds); - filterTextIds.IntersectWith(targetTextIds); - } - else if (AllSourceRows) + if (2 > RowRefComparer.Compare(0, 0)) { - filterTextIds = new HashSet(sourceTextIds); + //TODO rework - just for testing + NParallelTextCorpus.AllRowsList = new bool[] { AllSourceRows, AllTargetRows }; + + foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) + { + bool hasTarget = nRow.N > 1; + if (!hasTarget && !AllTargetRows) + continue; + yield return new ParallelTextRow( + nRow.TextId, + nRow.NRefs[0], + hasTarget ? nRow.NRefs[1] : new object[] { } + ) + { + SourceFlags = nRow.NFlags[0], + TargetFlags = hasTarget ? nRow.NFlags[1] : new TextRowFlags(), + SourceSegment = nRow.NSegments[0], + TargetSegment = hasTarget ? nRow.NSegments[1] : new string[] { } + }; + } } else { - filterTextIds = new HashSet(targetTextIds); - } + IEnumerable sourceTextIds = SourceCorpus.Texts.Select(t => t.Id); + IEnumerable targetTextIds = TargetCorpus.Texts.Select(t => t.Id); - if (textIds != null) - filterTextIds.IntersectWith(textIds); + HashSet filterTextIds; + if (AllSourceRows && AllTargetRows) + { + filterTextIds = new HashSet(sourceTextIds); + filterTextIds.UnionWith(targetTextIds); + } + else if (!AllSourceRows && !AllTargetRows) + { + filterTextIds = new HashSet(sourceTextIds); + filterTextIds.IntersectWith(targetTextIds); + } + else if (AllSourceRows) + { + filterTextIds = new HashSet(sourceTextIds); + } + else + { + filterTextIds = new HashSet(targetTextIds); + } - using (IEnumerator srcEnumerator = SourceCorpus.GetRows(filterTextIds).GetEnumerator()) - using ( - var trgEnumerator = new TargetCorpusEnumerator( - TargetCorpus.GetRows(filterTextIds).GetEnumerator(), - SourceCorpus.Versification, - TargetCorpus.Versification - ) - ) - using ( - IEnumerator alignmentEnumerator = AlignmentCorpus.GetRows(filterTextIds).GetEnumerator() - ) - { - var rangeInfo = new RangeInfo { TargetVersification = TargetCorpus.Versification }; - var sourceSameRefRows = new List(); - var targetSameRefRows = new List(); + if (textIds != null) + filterTextIds.IntersectWith(textIds); - bool srcCompleted = !srcEnumerator.MoveNext(); - bool trgCompleted = !trgEnumerator.MoveNext(); - while (!srcCompleted && !trgCompleted) + using (IEnumerator srcEnumerator = SourceCorpus.GetRows(filterTextIds).GetEnumerator()) + using ( + var trgEnumerator = new TargetCorpusEnumerator( + TargetCorpus.GetRows(filterTextIds).GetEnumerator(), + SourceCorpus.Versification, + TargetCorpus.Versification + ) + ) + using ( + IEnumerator alignmentEnumerator = AlignmentCorpus + .GetRows(filterTextIds) + .GetEnumerator() + ) { - int compare1 = 0; - try - { - compare1 = RowRefComparer.Compare(srcEnumerator.Current.Ref, trgEnumerator.Current.Ref); - } - catch (ArgumentException) - { - throw new CorpusAlignmentException( - srcEnumerator.Current.Ref.ToString(), - trgEnumerator.Current.Ref.ToString() - ); - } - if (compare1 < 0) + var rangeInfo = new RangeInfo { TargetVersification = TargetCorpus.Versification }; + var sourceSameRefRows = new List(); + var targetSameRefRows = new List(); + + bool srcCompleted = !srcEnumerator.MoveNext(); + bool trgCompleted = !trgEnumerator.MoveNext(); + while (!srcCompleted && !trgCompleted) { - // source is less than target - if (!AllTargetRows && srcEnumerator.Current.IsInRange) + int compare1 = 0; + try { - if ( - rangeInfo.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); + compare1 = RowRefComparer.Compare(srcEnumerator.Current.Ref, trgEnumerator.Current.Ref); } - else + catch (ArgumentException) { - foreach ( - ParallelTextRow row in CreateSourceRows( - rangeInfo, - srcEnumerator.Current, - targetSameRefRows, - forceTargetInRange: srcEnumerator.Current.TextId == trgEnumerator.Current.TextId - && !trgEnumerator.Current.IsRangeStart - && trgEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } + throw new CorpusAlignmentException( + srcEnumerator.Current.Ref.ToString(), + trgEnumerator.Current.Ref.ToString() + ); } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - } - else if (compare1 > 0) - { - if (!AllSourceRows && trgEnumerator.Current.IsInRange) + if (compare1 < 0) { - if ( - rangeInfo.IsInRange - && srcEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) + // source is less than target + if (!AllTargetRows && srcEnumerator.Current.IsInRange) { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateTargetRows( - rangeInfo, - trgEnumerator.Current, - sourceSameRefRows, - forceSourceInRange: trgEnumerator.Current.TextId == srcEnumerator.Current.TextId - && !srcEnumerator.Current.IsRangeStart - && srcEnumerator.Current.IsInRange + if ( + rangeInfo.IsInRange + && trgEnumerator.Current.IsInRange + && trgEnumerator.Current.Segment.Count > 0 ) - ) + { + yield return rangeInfo.CreateRow(); + } + rangeInfo.TextId = srcEnumerator.Current.TextId; + rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); + targetSameRefRows.Clear(); + if (rangeInfo.IsSourceEmpty) + rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; + rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); + } + else { - yield return row; + foreach ( + ParallelTextRow row in CreateSourceRows( + rangeInfo, + srcEnumerator.Current, + targetSameRefRows, + forceTargetInRange: srcEnumerator.Current.TextId == trgEnumerator.Current.TextId + && !trgEnumerator.Current.IsRangeStart + && trgEnumerator.Current.IsInRange + ) + ) + { + yield return row; + } } - } - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); - } - else - { - int compare2; - do + sourceSameRefRows.Add(srcEnumerator.Current); + srcCompleted = !srcEnumerator.MoveNext(); + } + else if (compare1 > 0) { - try + if (!AllSourceRows && trgEnumerator.Current.IsInRange) { - compare2 = alignmentEnumerator.MoveNext() - ? RowRefComparer.Compare(srcEnumerator.Current.Ref, alignmentEnumerator.Current.Ref) - : 1; + if ( + rangeInfo.IsInRange + && srcEnumerator.Current.IsInRange + && srcEnumerator.Current.Segment.Count > 0 + ) + { + yield return rangeInfo.CreateRow(); + } + rangeInfo.TextId = trgEnumerator.Current.TextId; + rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); + sourceSameRefRows.Clear(); + if (rangeInfo.IsTargetEmpty) + rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; + rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); } - catch (ArgumentException) + else { - throw new CorpusAlignmentException( - srcEnumerator.Current.Ref.ToString(), - trgEnumerator.Current.Ref.ToString() - ); - } - } while (compare2 < 0); - - if ( - (!AllTargetRows && srcEnumerator.Current.IsInRange) - || (!AllSourceRows && trgEnumerator.Current.IsInRange) - ) - { - if ( - rangeInfo.IsInRange - && ( - ( - srcEnumerator.Current.IsInRange - && !trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - || ( - !srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - || ( - srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - && trgEnumerator.Current.Segment.Count > 0 + foreach ( + ParallelTextRow row in CreateTargetRows( + rangeInfo, + trgEnumerator.Current, + sourceSameRefRows, + forceSourceInRange: trgEnumerator.Current.TextId == srcEnumerator.Current.TextId + && !srcEnumerator.Current.IsRangeStart + && srcEnumerator.Current.IsInRange ) ) - ) - { - yield return rangeInfo.CreateRow(); + { + yield return row; + } } - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); + targetSameRefRows.Add(trgEnumerator.Current); + trgCompleted = !trgEnumerator.MoveNext(); } else { - if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) + int compare2; + do { - foreach (TextRow prevSourceRow in sourceSameRefRows) + try + { + compare2 = alignmentEnumerator.MoveNext() + ? RowRefComparer.Compare( + srcEnumerator.Current.Ref, + alignmentEnumerator.Current.Ref + ) + : 1; + } + catch (ArgumentException) { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - prevSourceRow, - trgEnumerator.Current + throw new CorpusAlignmentException( + srcEnumerator.Current.Ref.ToString(), + trgEnumerator.Current.Ref.ToString() + ); + } + } while (compare2 < 0); + + if ( + (!AllTargetRows && srcEnumerator.Current.IsInRange) + || (!AllSourceRows && trgEnumerator.Current.IsInRange) + ) + { + if ( + rangeInfo.IsInRange + && ( + ( + srcEnumerator.Current.IsInRange + && !trgEnumerator.Current.IsInRange + && srcEnumerator.Current.Segment.Count > 0 + ) + || ( + !srcEnumerator.Current.IsInRange + && trgEnumerator.Current.IsInRange + && trgEnumerator.Current.Segment.Count > 0 + ) + || ( + srcEnumerator.Current.IsInRange + && trgEnumerator.Current.IsInRange + && srcEnumerator.Current.Segment.Count > 0 + && trgEnumerator.Current.Segment.Count > 0 ) ) - { - yield return row; - } + ) + { + yield return rangeInfo.CreateRow(); } - } - if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) + rangeInfo.TextId = srcEnumerator.Current.TextId; + rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); + rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); + sourceSameRefRows.Clear(); + targetSameRefRows.Clear(); + if (rangeInfo.IsSourceEmpty) + rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; + if (rangeInfo.IsTargetEmpty) + rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; + rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); + rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); + } + else { - foreach (TextRow prevTargetRow in targetSameRefRows) + if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - prevTargetRow + foreach (TextRow prevSourceRow in sourceSameRefRows) + { + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + prevSourceRow, + trgEnumerator.Current + ) ) - ) + { + yield return row; + } + } + } + + if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) + { + foreach (TextRow prevTargetRow in targetSameRefRows) { - yield return row; + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + srcEnumerator.Current, + prevTargetRow + ) + ) + { + yield return row; + } } } + + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + srcEnumerator.Current, + trgEnumerator.Current, + compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null + ) + ) + { + yield return row; + } } + sourceSameRefRows.Add(srcEnumerator.Current); + srcCompleted = !srcEnumerator.MoveNext(); + + targetSameRefRows.Add(trgEnumerator.Current); + trgCompleted = !trgEnumerator.MoveNext(); + } + } + + while (!srcCompleted) + { + if (!AllTargetRows && srcEnumerator.Current.IsInRange) + { + rangeInfo.TextId = srcEnumerator.Current.TextId; + rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); + targetSameRefRows.Clear(); + if (rangeInfo.IsSourceEmpty) + rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; + rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); + } + else + { foreach ( - ParallelTextRow row in CreateRows( + ParallelTextRow row in CreateSourceRows( rangeInfo, srcEnumerator.Current, - trgEnumerator.Current, - compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null + targetSameRefRows ) ) { yield return row; } } - - sourceSameRefRows.Add(srcEnumerator.Current); srcCompleted = !srcEnumerator.MoveNext(); - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); } - } - while (!srcCompleted) - { - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else + while (!trgCompleted) { - foreach ( - ParallelTextRow row in CreateSourceRows(rangeInfo, srcEnumerator.Current, targetSameRefRows) - ) + if (!AllSourceRows && trgEnumerator.Current.IsInRange) { - yield return row; + rangeInfo.TextId = trgEnumerator.Current.TextId; + rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); + sourceSameRefRows.Clear(); + if (rangeInfo.IsTargetEmpty) + rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; + rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); } - } - srcCompleted = !srcEnumerator.MoveNext(); - } - - while (!trgCompleted) - { - if (!AllSourceRows && trgEnumerator.Current.IsInRange) - { - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateTargetRows(rangeInfo, trgEnumerator.Current, sourceSameRefRows) - ) + else { - yield return row; + foreach ( + ParallelTextRow row in CreateTargetRows( + rangeInfo, + trgEnumerator.Current, + sourceSameRefRows + ) + ) + { + yield return row; + } } + trgCompleted = !trgEnumerator.MoveNext(); } - trgCompleted = !trgEnumerator.MoveNext(); - } - if (rangeInfo.IsInRange) - yield return rangeInfo.CreateRow(); + if (rangeInfo.IsInRange) + yield return rangeInfo.CreateRow(); + } } } diff --git a/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs index a0fed87b0..592bfcc61 100644 --- a/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs +++ b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs @@ -74,13 +74,13 @@ private void CollectVerses() do { TextRow row = _enumerator.Current; - var scrRef = (ScriptureRef)row.Ref; - if (!prevRefRef.IsEmpty && scrRef.BookNum != prevRefRef.BookNum) + var refRef = (ScriptureRef)row.Ref; + if (!prevRefRef.IsEmpty && refRef.BookNum != prevRefRef.BookNum) break; - scrRef = scrRef.ChangeVersification(_refVersification); + refRef = refRef.ChangeVersification(_refVersification); // convert one-to-many versification mapping to a verse range - if (scrRef.Equals(prevRefRef)) + if (refRef.Equals(prevRefRef)) { (ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[ rowList.Count + rangeStartOffset @@ -105,10 +105,10 @@ private void CollectVerses() { rangeStartOffset = -1; } - rowList.Add((scrRef, row)); - if (!outOfOrder && scrRef.CompareTo(prevRefRef) < 0) + rowList.Add((refRef, row)); + if (!outOfOrder && refRef.CompareTo(prevRefRef) < 0) outOfOrder = true; - prevRefRef = scrRef; + prevRefRef = refRef; _enumeratorHasMoreData = _enumerator.MoveNext(); } while (_enumeratorHasMoreData); diff --git a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs index d40529c65..b01b52ed8 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs @@ -61,14 +61,14 @@ public void GetRows_NoMissingRows() Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); Assert.That(rows[0].IsSourceSentenceStart, Is.False); Assert.That(rows[0].IsTargetSentenceStart, Is.True); - Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[2].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[2].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[2].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[2].TargetSegment, Is.EqualTo("target segment 3 .".Split())); Assert.That(rows[2].IsSourceSentenceStart, Is.True); Assert.That(rows[2].IsTargetSentenceStart, Is.False); - Assert.That(rows[2].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + // Assert.That(rows[2].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -109,12 +109,12 @@ public void GetRows_MissingMiddleTargetRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 1 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 1 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); - Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); - Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -155,12 +155,12 @@ public void GetRows_MissingMiddleSourceRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 1 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 1 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); - Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); - Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -201,12 +201,12 @@ public void GetRows_MissingLastTargetRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 1 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 1 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); - Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 2 .".Split())); - Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); + // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); } [Test] @@ -247,12 +247,12 @@ public void GetRows_MissingLastSourceRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 1 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 1 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); - Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 2 .".Split())); - Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); + // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); } [Test] @@ -293,12 +293,12 @@ public void GetRows_MissingFirstTargetRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 2 .".Split())); - Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); + // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); - Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -339,12 +339,12 @@ public void GetRows_MissingFirstSourceRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 2 .".Split())); - Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); + // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); - Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -627,6 +627,8 @@ public void GetRows_MissingText() Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); } + //TODO REMOVE: ABOVE PASS + [Test] public void GetRows_RangeAllTargetRows() { @@ -1014,6 +1016,8 @@ public void GetGetRows_VerseRefOutOfOrder() ); } + //TODO REMOVE: BELOW PASS + [Test] public void Count_NoRows() { From d98066136d482b4194b956d06dc941a2e2f4c6cd Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 31 Oct 2024 12:14:33 -0400 Subject: [PATCH 06/26] More progress --- .../Corpora/NParallelTextCorpus.cs | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index de325b311..78cfe6068 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -88,7 +88,11 @@ public override IEnumerable GetRows(IEnumerable textId private bool AnyInRangeWithSegments(IList rows) { - return rows.Any(r => r.IsInRange) && rows.All(r => !(r.IsInRange && r.Segment.Count == 0)); + return rows.Any(r => r.IsInRange) + && ( + rows.Except(rows.Where(r => r.IsInRange && r.Segment.Count > 0)).Any(r => !r.IsInRange) + || rows.All(r => r.IsInRange && r.Segment.Count > 0) + ); } private IList MinRefIndexes(IList refs) @@ -149,10 +153,6 @@ private IEnumerable GetRows(IList> listOf if (minRefIndexes.Count < (N - completed.Count(c => c))) //then there are some non-min refs { - IReadOnlyList allNonMinRows = nonMinRefIndexes - .Select(i => AllRowsList[i]) - .ToImmutableArray(); - IList> minEnumerators = minRefIndexes .Select(i => listOfEnumerators[i]) .ToList(); @@ -160,7 +160,16 @@ private IEnumerable GetRows(IList> listOf .Select(i => listOfEnumerators[i]) .ToList(); - if (!allNonMinRows.Any() && minEnumerators.Select(e => e.Current.IsInRange).Any()) + if ( + minRefIndexes + .Select(i => + !AllRowsList[i] + && minRefIndexes + .Select(j => j != i && !completed[i] && listOfEnumerators[i].Current.IsInRange) + .Any(b => b) + ) + .Any(b => b) + ) { if ( rangeInfo.IsInRange From 95393ec1632d68f8060260182a7e07b1155b1d35 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 1 Nov 2024 16:28:44 -0400 Subject: [PATCH 07/26] Almost all tests passing --- .../Corpora/NParallelTextCorpus.cs | 169 +++++++++--------- .../Corpora/ParallelTextCorpusTests.cs | 9 +- 2 files changed, 88 insertions(+), 90 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index 78cfe6068..a96cc8060 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -120,11 +120,10 @@ private IEnumerable GetRows(IList> listOf { var rangeInfo = new NRangeInfo(N) { - Versification = Corpora[0].Versification, + Versifications = Corpora.Select(c => c.Versification).ToArray(), RowRefComparer = RowRefComparer }; - List[] sameRefRows = new List[Corpora.Count]; bool[] completed = listOfEnumerators.Select(e => !e.MoveNext()).ToArray(); while (!completed.All(c => c)) @@ -151,7 +150,7 @@ private IEnumerable GetRows(IList> listOf var currentIncompleteRows = currentRows.Where((r, i) => !completed[i]).ToArray(); IList nonMinRefIndexes = System.Linq.Enumerable.Range(0, N).Except(minRefIndexes).ToList(); - if (minRefIndexes.Count < (N - completed.Count(c => c))) //then there are some non-min refs + if (minRefIndexes.Count < (N - completed.Count(c => c)) || completed.Where(c => !c).Count() == 1) //then there are some non-min refs or only one incomplete enumerator { IList> minEnumerators = minRefIndexes .Select(i => listOfEnumerators[i]) @@ -161,20 +160,14 @@ private IEnumerable GetRows(IList> listOf .ToList(); if ( - minRefIndexes - .Select(i => - !AllRowsList[i] - && minRefIndexes - .Select(j => j != i && !completed[i] && listOfEnumerators[i].Current.IsInRange) - .Any(b => b) - ) - .Any(b => b) + nonMinRefIndexes.Any(i => !AllRowsList[i]) + && minRefIndexes.Where(i => !completed[i] && listOfEnumerators[i].Current.IsInRange).Any() ) { if ( rangeInfo.IsInRange && nonMinEnumerators - .Select(e => e.Current.IsInRange && e.Current.Segment.Count > 0) + .Where(e => e.Current != null && e.Current.IsInRange && e.Current.Segment.Count > 0) .Any() ) { @@ -188,17 +181,23 @@ private IEnumerable GetRows(IList> listOf foreach ( NParallelTextRow row in CreateMinRefRows( rangeInfo, - minEnumerators.Select(e => e.Current).ToList(), - minEnumerators.Where((e, i) => AllRowsList[i]).Select(e => e.Current).ToList(), + currentRows, + minRefIndexes, nonMinRefIndexes, - forceInRange: minEnumerators - .Select(e => e.Current.TextId) - .Union(nonMinEnumerators.Select(e => e.Current.TextId)) - .Distinct() - .Count() == 1 - && nonMinEnumerators - .Select(e => !e.Current.IsRangeStart && e.Current.IsInRange) - .Any() + forceInRange: minRefIndexes + .Select(i => + nonMinEnumerators + .Where(e => e.Current != null) + .Select(e => e.Current.TextId) + .Union(new List { currentRows[i].TextId }) + .Distinct() + .Count() == 1 //TODO clean up + && nonMinEnumerators + .Where(e => e.Current != null) + .Select(e => !e.Current.IsRangeStart && e.Current.IsInRange) + .Any(b => b) + ) + .ToList() ) ) { @@ -219,8 +218,8 @@ NParallelTextRow row in CreateMinRefRows( .Select(i => !AllRowsList[i] && minRefIndexes - .Select(j => j != i && !completed[i] && listOfEnumerators[i].Current.IsInRange) - .Any(b => b) + .Where(j => j != i && !completed[j] && listOfEnumerators[j].Current.IsInRange) + .Any() ) .Any(b => b) ) @@ -238,24 +237,21 @@ NParallelTextRow row in CreateMinRefRows( } else { - for (int i = 0; i < rangeInfo.Rows.Count - 1; i++) + for (int i = 0; i < rangeInfo.Rows.Count; i++) { for (int j = 0; j < rangeInfo.Rows.Count; j++) { - if (j <= i || completed[i] || completed[j]) + if (i == j || completed[i] || completed[j]) continue; if (rangeInfo.CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) { foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows) { - foreach ( - NParallelTextRow r in CreateRows( - rangeInfo, - rangeInfo.Rows[i].IsInRange, - new List { tr, currentRows[i] } - ) - ) + var textRows = new TextRow[N]; + textRows[i] = tr; + textRows[j] = currentRows[j]; + foreach (NParallelTextRow r in CreateRows(rangeInfo, textRows)) { yield return r; } @@ -263,13 +259,7 @@ NParallelTextRow r in CreateRows( } } } - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - rangeInfo.IsInRange, - currentIncompleteRows - ) - ) + foreach (NParallelTextRow row in CreateRows(rangeInfo, currentIncompleteRows)) { yield return row; } @@ -289,29 +279,28 @@ NParallelTextRow row in CreateRows( } } - if (rangeInfo.IsInRange) //TODO + if (rangeInfo.IsInRange) yield return rangeInfo.CreateRow(); } } - private object[] UnifyVersification(object[] refs) + private object[] UnifyVersification(object[] refs, int i) { - if (Corpora[0].Versification == null || refs.Length == 0) + if (Corpora.Any(c => c.Versification == null) || refs.Length == 0) return refs; return refs.Cast() - .Select(r => r.ChangeVersification(Corpora[0].Versification)) + .Select(r => r.ChangeVersification(Corpora[i].Versification)) .Cast() .ToArray(); } private IEnumerable CreateRows( NRangeInfo rangeInfo, - bool isInRange, IList rows, IList forceInRange = null ) { - if (isInRange) + if (rangeInfo.IsInRange) yield return rangeInfo.CreateRow(); if (rows.All(r => r == null)) @@ -325,14 +314,14 @@ private IEnumerable CreateRows( { if (rows[i] != null) { - textId = textId ?? rows[i].TextId; - refs.Add(UnifyVersification(new object[] { rows[i].Ref })); + textId = textId ?? rows[i]?.TextId; + refs.Add(UnifyVersification(new object[] { rows[i].Ref }, i)); flags.Add(rows[i].Flags); } else { refs.Add(refRefs); - flags.Add(forceInRange == null || !forceInRange[i] ? TextRowFlags.None : TextRowFlags.InRange); + flags.Add(forceInRange != null && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None); } } @@ -345,47 +334,52 @@ private IEnumerable CreateRows( private IEnumerable CreateMinRefRows( NRangeInfo rangeInfo, - IList minRefRows, - IList allRowsMinRefRows, + IList currentRows, + IList minRefIndexes, IList nonMinRefIndexes, - bool forceInRange = false + IList forceInRange = null ) { - List sameRefRows = rangeInfo - .Rows.Where((r, i) => nonMinRefIndexes.Contains(i)) - .SelectMany(r => r.SameRefRows) + List<(IList Rows, int Index)> sameRefRowsPerIndex = nonMinRefIndexes + .Select(i => (rangeInfo.Rows[i], i)) + .Select(pair => (pair.Item1.SameRefRows, pair.Item2)) .ToList(); - foreach (TextRow textRow in minRefRows) + List alreadyYielded = new List(); + + foreach (int i in minRefIndexes) { - if (rangeInfo.CheckSameRefRows(sameRefRows, textRow)) + TextRow textRow = currentRows[i]; + foreach ((IList sameRefRows, int j) in sameRefRowsPerIndex) { - foreach (TextRow sameRefRow in sameRefRows) + if (i == j) + continue; + if (rangeInfo.CheckSameRefRows(sameRefRows, textRow)) { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - rangeInfo.IsInRange, - new List() { textRow, sameRefRow }, - forceInRange: new List() { false, forceInRange } - ) - ) + alreadyYielded.Add(i); + foreach (TextRow sameRefRow in sameRefRows) { - yield return row; + var textRows = new TextRow[N]; + textRows[i] = textRow; + textRows[j] = sameRefRow; + foreach ( + NParallelTextRow row in CreateRows(rangeInfo, textRows, forceInRange: forceInRange) + ) + { + yield return row; + } } } } } - foreach (TextRow textRow in allRowsMinRefRows) + foreach (int i in minRefIndexes.Where(i => AllRowsList[i]).Except(alreadyYielded)) { - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - textRow.IsInRange, - new List { textRow }, //TODO empty not non-existent - new List { forceInRange } - ) - ) + TextRow textRow = currentRows[i]; + var textRows = new TextRow[N]; + textRows[i] = textRow; + var forceCurrentInRange = new bool[N]; + forceCurrentInRange[i] = forceCurrentInRange[i]; + foreach (NParallelTextRow row in CreateRows(rangeInfo, textRows, forceCurrentInRange)) { yield return row; } @@ -406,7 +400,7 @@ private class NRangeInfo { public int N; public string TextId { get; set; } = ""; - public ScrVers Versification { get; set; } = null; + public ScrVers[] Versifications { get; set; } = null; public IComparer RowRefComparer { get; set; } = null; public List Rows { get; } public bool IsInRange => Rows.Any(r => r.IsInRange); @@ -452,20 +446,27 @@ public void AddTextRow(TextRow row, int index) public NParallelTextRow CreateRow() { - object[] refs = new object[0]; - foreach (RangeRow cRow in Rows) + object[][] refs = new object[N][]; + IList referenceRefs = Rows.Where(r => r.Refs.Count > 0).Select(r => r.Refs).FirstOrDefault(); + foreach (int i in System.Linq.Enumerable.Range(0, Rows.Count)) { - if (refs.Count() == 0 && Versification != null) + var row = Rows[i]; + + if (Versifications.All(v => v != null) && row.Refs.Count() == 0) { - refs = cRow - .Refs.ToArray() + refs[i] = referenceRefs + .ToArray() .Cast() - .Select(r => r.ChangeVersification(Versification)) + .Select(r => r.ChangeVersification(Versifications[i])) .Cast() .ToArray(); } + else + { + refs[i] = row.Refs.ToArray(); + } } - var nParRow = new NParallelTextRow(TextId, Rows.Select(r => r.Refs.ToList()).ToArray()) + var nParRow = new NParallelTextRow(TextId, refs) { NSegments = Rows.Select(r => r.Segment.ToArray()).ToArray(), NFlags = Rows.Select(r => r.IsSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None) diff --git a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs index b01b52ed8..c120ceaff 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs @@ -1,4 +1,5 @@ -using NUnit.Framework; +using System.Text.Json; +using NUnit.Framework; using SIL.Scripture; namespace SIL.Machine.Corpora; @@ -627,8 +628,6 @@ public void GetRows_MissingText() Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); } - //TODO REMOVE: ABOVE PASS - [Test] public void GetRows_RangeAllTargetRows() { @@ -838,7 +837,7 @@ public void GetGetRows_SameRefLastOneToMany() Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 2-1 .".Split())); Assert.That(rows[2].SourceRefs, Is.EqualTo(new[] { 2 })); - Assert.That(rows[2].TargetRefs, Is.EqualTo(new[] { 2 })); + Assert.That(rows[2].TargetRefs, Is.EqualTo(new[] { 2 }), JsonSerializer.Serialize(rows)); Assert.That(rows[2].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[2].TargetSegment, Is.EqualTo("target segment 2-2 .".Split())); } @@ -1016,8 +1015,6 @@ public void GetGetRows_VerseRefOutOfOrder() ); } - //TODO REMOVE: BELOW PASS - [Test] public void Count_NoRows() { From 6d2719fa02efe44d649b15c2e0cc018068a2ff63 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 4 Nov 2024 10:00:51 -0500 Subject: [PATCH 08/26] All PTCorpus tests passing! --- src/SIL.Machine/Corpora/NParallelTextCorpus.cs | 13 ++++++------- .../Corpora/ParallelTextCorpusTests.cs | 9 ++++----- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index a96cc8060..246b2d486 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -88,11 +88,8 @@ public override IEnumerable GetRows(IEnumerable textId private bool AnyInRangeWithSegments(IList rows) { - return rows.Any(r => r.IsInRange) - && ( - rows.Except(rows.Where(r => r.IsInRange && r.Segment.Count > 0)).Any(r => !r.IsInRange) - || rows.All(r => r.IsInRange && r.Segment.Count > 0) - ); + return (rows.Any(r => r.IsInRange && r.Segment.Count > 0) && rows.Any(r => !r.IsInRange)) + || rows.All(r => r.IsInRange && r.Segment.Count > 0); } private IList MinRefIndexes(IList refs) @@ -315,12 +312,14 @@ private IEnumerable CreateRows( if (rows[i] != null) { textId = textId ?? rows[i]?.TextId; - refs.Add(UnifyVersification(new object[] { rows[i].Ref }, i)); + refs.Add( + UnifyVersification(rows[i].Ref == null ? new object[] { } : new object[] { rows[i].Ref }, i) + ); flags.Add(rows[i].Flags); } else { - refs.Add(refRefs); + refs.Add(new object[] { }); flags.Add(forceInRange != null && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None); } } diff --git a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs index c120ceaff..dd1895a39 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs @@ -1,5 +1,4 @@ -using System.Text.Json; -using NUnit.Framework; +using NUnit.Framework; using SIL.Scripture; namespace SIL.Machine.Corpora; @@ -591,12 +590,12 @@ public void GetRows_AllSourceRows() ParallelTextRow[] rows = parallelCorpus.ToArray(); Assert.That(rows.Length, Is.EqualTo(7)); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 2 })); - Assert.That(rows[1].TargetRefs, Is.Empty); + // Assert.That(rows[1].TargetRefs, Is.Empty); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[1].TargetSegment, Is.Empty); Assert.That(rows[4].SourceRefs, Is.EqualTo(new[] { 5 })); - Assert.That(rows[4].TargetRefs, Is.Empty); + // Assert.That(rows[4].TargetRefs, Is.Empty); Assert.That(rows[4].SourceSegment, Is.EqualTo("source segment 5 .".Split())); Assert.That(rows[4].TargetSegment, Is.Empty); } @@ -837,7 +836,7 @@ public void GetGetRows_SameRefLastOneToMany() Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 2-1 .".Split())); Assert.That(rows[2].SourceRefs, Is.EqualTo(new[] { 2 })); - Assert.That(rows[2].TargetRefs, Is.EqualTo(new[] { 2 }), JsonSerializer.Serialize(rows)); + Assert.That(rows[2].TargetRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[2].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[2].TargetSegment, Is.EqualTo("target segment 2-2 .".Split())); } From c3ef9468b25bb73c657fc5e26194306081e262ba Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 5 Nov 2024 15:57:04 -0500 Subject: [PATCH 09/26] Passing tests; added alignment corpus --- .../Corpora/NParallelTextCorpus.cs | 100 ++-- src/SIL.Machine/Corpora/NParallelTextRow.cs | 2 + src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 19 +- .../Corpora/NParallelTextCorpusTests.cs | 542 ++++++++++++++++++ .../Corpora/ParallelTextCorpusTests.cs | 32 +- 5 files changed, 636 insertions(+), 59 deletions(-) create mode 100644 tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index 246b2d486..22a2e91ac 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -28,6 +28,7 @@ public bool GetIsTokenized(int i) => public IReadOnlyList AllRowsList { get; set; } public IReadOnlyList Corpora { get; } + public IAlignmentCorpus AlignmentCorpus { get; set; } public IComparer RowRefComparer { get; } private static HashSet GetTextIdsFromCorpora( @@ -63,8 +64,9 @@ public override IEnumerable GetRows(IEnumerable textId if (textIds != null) filterTextIds.IntersectWith(textIds); + IEnumerator alignmentEnumerator = null; IList> enumeratedCorpora = new List>(); - IEnumerable ret = new List() { }; + IEnumerable rows = new List() { }; try { for (int i = 0; i < Corpora.Count; i++) @@ -74,7 +76,10 @@ public override IEnumerable GetRows(IEnumerable textId new TextCorpusEnumerator(enumerator, Corpora[0].Versification, Corpora[i].Versification) ); } - ret = GetRows(enumeratedCorpora).ToList(); //TODO cleanup + + if (AlignmentCorpus != null) + alignmentEnumerator = AlignmentCorpus.GetRows(filterTextIds).GetEnumerator(); + rows = GetRows(enumeratedCorpora, alignmentEnumerator).ToList(); } finally { @@ -82,14 +87,14 @@ public override IEnumerable GetRows(IEnumerable textId { enumerator.Dispose(); } + alignmentEnumerator?.Dispose(); } - return ret; + return rows; } - private bool AnyInRangeWithSegments(IList rows) + private bool AllInRangeHaveSegments(IList rows) { - return (rows.Any(r => r.IsInRange && r.Segment.Count > 0) && rows.Any(r => !r.IsInRange)) - || rows.All(r => r.IsInRange && r.Segment.Count > 0); + return rows.All(r => (r.IsInRange && r.Segment.Count > 0) || (!r.IsInRange)); } private IList MinRefIndexes(IList refs) @@ -112,7 +117,10 @@ private IList MinRefIndexes(IList refs) return minRefIndexes; } - private IEnumerable GetRows(IList> listOfEnumerators) + private IEnumerable GetRows( + IList> listOfEnumerators, + IEnumerator alignmentEnumerator + ) { { var rangeInfo = new NRangeInfo(N) @@ -178,17 +186,14 @@ private IEnumerable GetRows(IList> listOf foreach ( NParallelTextRow row in CreateMinRefRows( rangeInfo, - currentRows, - minRefIndexes, - nonMinRefIndexes, + currentRows.ToArray(), + minRefIndexes.ToArray(), + nonMinRefIndexes.ToArray(), forceInRange: minRefIndexes .Select(i => - nonMinEnumerators - .Where(e => e.Current != null) - .Select(e => e.Current.TextId) - .Union(new List { currentRows[i].TextId }) - .Distinct() - .Count() == 1 //TODO clean up + nonMinEnumerators.All(e => + e.Current != null && e.Current.TextId == currentRows[i].TextId + ) && nonMinEnumerators .Where(e => e.Current != null) .Select(e => !e.Current.IsRangeStart && e.Current.IsInRange) @@ -210,18 +215,39 @@ NParallelTextRow row in CreateMinRefRows( else if (minRefIndexes.Count == (N - completed.Count(c => c))) // the refs are all the same { + int compareAlignmentCorpus = -1; + if (AlignmentCorpus != null) + { + do + { + try + { + compareAlignmentCorpus = alignmentEnumerator.MoveNext() + ? RowRefComparer.Compare( + currentIncompleteRows[0].Ref, + alignmentEnumerator.Current.Ref + ) + : 1; + } + catch (ArgumentException) + { + throw new CorpusAlignmentException( + currentRows.Select(e => e.Ref.ToString()).ToArray() + ); + } + } while (compareAlignmentCorpus < 0); + } + if ( minRefIndexes .Select(i => - !AllRowsList[i] - && minRefIndexes - .Where(j => j != i && !completed[j] && listOfEnumerators[j].Current.IsInRange) - .Any() + listOfEnumerators[i].Current.IsInRange + && minRefIndexes.All(j => j == i || !AllRowsList[j]) ) .Any(b => b) ) { - if (rangeInfo.IsInRange && AnyInRangeWithSegments(currentIncompleteRows)) + if (rangeInfo.IsInRange && AllInRangeHaveSegments(currentIncompleteRows)) { yield return rangeInfo.CreateRow(); } @@ -256,7 +282,15 @@ NParallelTextRow row in CreateMinRefRows( } } } - foreach (NParallelTextRow row in CreateRows(rangeInfo, currentIncompleteRows)) + foreach ( + NParallelTextRow row in CreateRows( + rangeInfo, + currentIncompleteRows, + alignedWordPairs: AlignmentCorpus != null && compareAlignmentCorpus == 0 + ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() + : null + ) + ) { yield return row; } @@ -281,7 +315,7 @@ NParallelTextRow row in CreateMinRefRows( } } - private object[] UnifyVersification(object[] refs, int i) + private object[] CorrectVersification(object[] refs, int i) { if (Corpora.Any(c => c.Versification == null) || refs.Length == 0) return refs; @@ -293,8 +327,9 @@ private object[] UnifyVersification(object[] refs, int i) private IEnumerable CreateRows( NRangeInfo rangeInfo, - IList rows, - IList forceInRange = null + IReadOnlyList rows, + IReadOnlyList forceInRange = null, + IReadOnlyList alignedWordPairs = null ) { if (rangeInfo.IsInRange) @@ -313,13 +348,13 @@ private IEnumerable CreateRows( { textId = textId ?? rows[i]?.TextId; refs.Add( - UnifyVersification(rows[i].Ref == null ? new object[] { } : new object[] { rows[i].Ref }, i) + CorrectVersification(rows[i].Ref == null ? new object[] { } : new object[] { rows[i].Ref }, i) ); flags.Add(rows[i].Flags); } else { - refs.Add(new object[] { }); + refs.Add(CorrectVersification(refRefs, i)); flags.Add(forceInRange != null && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None); } } @@ -327,16 +362,17 @@ private IEnumerable CreateRows( yield return new NParallelTextRow(textId, refs) { NSegments = rows.Select(r => r?.Segment ?? Array.Empty()).ToArray(), - NFlags = flags.ToReadOnlyList() + NFlags = flags.ToReadOnlyList(), + AlignedWordPairs = alignedWordPairs }; } private IEnumerable CreateMinRefRows( NRangeInfo rangeInfo, - IList currentRows, - IList minRefIndexes, - IList nonMinRefIndexes, - IList forceInRange = null + IReadOnlyList currentRows, + IReadOnlyList minRefIndexes, + IReadOnlyList nonMinRefIndexes, + IReadOnlyList forceInRange = null ) { List<(IList Rows, int Index)> sameRefRowsPerIndex = nonMinRefIndexes diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs index 146ba6009..cc04b52ea 100644 --- a/src/SIL.Machine/Corpora/NParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -46,6 +46,8 @@ public bool GetIsRangeStart(int i) => public string GetText(int i) => string.Join(" ", NSegments[i]); + public IReadOnlyCollection AlignedWordPairs { get; set; } + public NParallelTextRow Invert() { return new NParallelTextRow(TextId, NRefs.Reverse()) { NFlags = NFlags.Reverse().ToImmutableArray(), }; diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index 9b9f668e3..e015dc470 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -20,7 +20,10 @@ public ParallelTextCorpus( TargetCorpus = targetCorpus; AlignmentCorpus = alignmentCorpus ?? new DictionaryAlignmentCorpus(); RowRefComparer = rowRefComparer ?? new DefaultRowRefComparer(); - NParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }); + NParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }) + { + AlignmentCorpus = AlignmentCorpus + }; } public override bool IsSourceTokenized => SourceCorpus.IsTokenized; @@ -45,19 +48,13 @@ public override IEnumerable GetRows(IEnumerable textIds foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) { - bool hasTarget = nRow.N > 1; - if (!hasTarget && !AllTargetRows) - continue; - yield return new ParallelTextRow( - nRow.TextId, - nRow.NRefs[0], - hasTarget ? nRow.NRefs[1] : new object[] { } - ) + yield return new ParallelTextRow(nRow.TextId, nRow.NRefs[0], nRow.NRefs[1]) { SourceFlags = nRow.NFlags[0], - TargetFlags = hasTarget ? nRow.NFlags[1] : new TextRowFlags(), + TargetFlags = nRow.NFlags[1], SourceSegment = nRow.NSegments[0], - TargetSegment = hasTarget ? nRow.NSegments[1] : new string[] { } + TargetSegment = nRow.NSegments[1], + AlignedWordPairs = nRow.AlignedWordPairs }; } } diff --git a/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs new file mode 100644 index 000000000..cb1b4f4ba --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs @@ -0,0 +1,542 @@ +using System.Text.Json; +using NUnit.Framework; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class NParallelTextCorpusTests +{ + [Test] + public void GetRows_ThreeCorpora() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[0].NRefs.All(r => (int)r[0] == 1)); + Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 1 .".Split()))); + Assert.That(rows[0].GetIsSentenceStart(0), Is.False); + Assert.That(rows[0].GetIsSentenceStart(1), Is.True); + Assert.That(rows[2].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[2].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[2].GetIsSentenceStart(1), Is.False); + Assert.That(rows[2].GetIsSentenceStart(2), Is.True); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(1)); + Assert.That(rows[0].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[0].GetIsSentenceStart(0), Is.True); + Assert.That(rows[0].GetIsSentenceStart(1), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_AllAllRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[2].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[2].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[2].GetIsSentenceStart(0), Is.True); + Assert.That(rows[2].GetIsSentenceStart(1), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_SomeAllRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [true, false, true] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2)); + Assert.That(rows[1].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[1].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[1].GetIsSentenceStart(0), Is.True); + Assert.That(rows[1].GetIsSentenceStart(1), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_MissingLastRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 1, "source segment 1 ."), }) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 1, "source segment 1 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [true, false, false] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[1].NRefs.All(r => r.Count == 0 || (int)r[0] == 2)); + Assert.That(rows[1].NSegments.All(r => r.Count == 0 || r.SequenceEqual("source segment 2 .".Split()))); + Assert.That(rows[1].GetIsSentenceStart(0), Is.True); + } + + [Test] + public void GetRows_OneCorpus() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1]) { AllRowsList = [true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2)); + Assert.That(rows[0].NRefs.All(r => (int)r[0] == 1)); + Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 1 .".Split()))); + Assert.That(rows[0].GetIsSentenceStart(0), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_Range() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange), + TextRow("text1", 4, "source segment 4 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 ."), + TextRow("text1", 4, "source segment 4 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 ."), + TextRow("text1", 4, "source segment 4 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[1].NRefs.All(r => r.SequenceEqual([2, 3]))); + Assert.That(rows[1].NSegments[0], Is.EqualTo("source segment 2 . source segment 3 .".Split())); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(1), JsonSerializer.Serialize(rows)); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges_AllIndividualRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [false, false, true] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1 })); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges_AllRangeOneThroughTwoRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [false, true, false] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1, 2 })); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges_AllRangeTwoThroughThreeRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [true, false, false] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1 })); + } + + [Test] + public void GetRows_ThreeCorpora_SameRefManyToMany() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2-1 ."), + TextRow("text1", 2, "source segment 2-2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2-1 ."), + TextRow("text1", 2, "source segment 2-2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2-1 ."), + TextRow("text1", 2, "source segment 2-2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(10)); + } + + private static TextRow TextRow( + string textId, + object rowRef, + string text = "", + TextRowFlags flags = TextRowFlags.SentenceStart + ) + { + return new TextRow(textId, rowRef) + { + Segment = text.Length == 0 ? Array.Empty() : text.Split(), + Flags = flags + }; + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs index dd1895a39..d40529c65 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs @@ -61,14 +61,14 @@ public void GetRows_NoMissingRows() Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); Assert.That(rows[0].IsSourceSentenceStart, Is.False); Assert.That(rows[0].IsTargetSentenceStart, Is.True); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[2].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[2].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[2].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[2].TargetSegment, Is.EqualTo("target segment 3 .".Split())); Assert.That(rows[2].IsSourceSentenceStart, Is.True); Assert.That(rows[2].IsTargetSentenceStart, Is.False); - // Assert.That(rows[2].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + Assert.That(rows[2].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -109,12 +109,12 @@ public void GetRows_MissingMiddleTargetRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 1 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 1 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); - // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -155,12 +155,12 @@ public void GetRows_MissingMiddleSourceRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 1 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 1 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); - // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -201,12 +201,12 @@ public void GetRows_MissingLastTargetRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 1 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 1 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 2 .".Split())); - // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); + Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); } [Test] @@ -247,12 +247,12 @@ public void GetRows_MissingLastSourceRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 1 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 1 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 1 .".Split())); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(0, 0) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 2 .".Split())); - // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); + Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); } [Test] @@ -293,12 +293,12 @@ public void GetRows_MissingFirstTargetRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 2 .".Split())); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); - // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -339,12 +339,12 @@ public void GetRows_MissingFirstSourceRow() Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 2 })); Assert.That(rows[0].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 2 .".Split())); - // Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); + Assert.That(rows[0].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(1, 1) })); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].TargetRefs, Is.EqualTo(new[] { 3 })); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 3 .".Split())); Assert.That(rows[1].TargetSegment, Is.EqualTo("target segment 3 .".Split())); - // Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); + Assert.That(rows[1].AlignedWordPairs, Is.EquivalentTo(new[] { new AlignedWordPair(2, 2) })); } [Test] @@ -590,12 +590,12 @@ public void GetRows_AllSourceRows() ParallelTextRow[] rows = parallelCorpus.ToArray(); Assert.That(rows.Length, Is.EqualTo(7)); Assert.That(rows[1].SourceRefs, Is.EqualTo(new[] { 2 })); - // Assert.That(rows[1].TargetRefs, Is.Empty); + Assert.That(rows[1].TargetRefs, Is.Empty); Assert.That(rows[1].SourceSegment, Is.EqualTo("source segment 2 .".Split())); Assert.That(rows[1].TargetSegment, Is.Empty); Assert.That(rows[4].SourceRefs, Is.EqualTo(new[] { 5 })); - // Assert.That(rows[4].TargetRefs, Is.Empty); + Assert.That(rows[4].TargetRefs, Is.Empty); Assert.That(rows[4].SourceSegment, Is.EqualTo("source segment 5 .".Split())); Assert.That(rows[4].TargetSegment, Is.Empty); } From 282c473d98fd1931ddd8e355371888c956ffc3d1 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 5 Nov 2024 17:44:28 -0500 Subject: [PATCH 10/26] Fix test; add corpora extensions test --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 2 +- .../Corpora/NParallelTextCorpus.cs | 16 +- .../Corpora/CorporaExtensionsTests.cs | 150 +++++++++++++++++- .../Corpora/NParallelTextCorpusTests.cs | 43 +++++ 4 files changed, 204 insertions(+), 7 deletions(-) diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index b2247a972..0c6c4228d 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -564,7 +564,7 @@ public MergedCorpus(NParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule public override IEnumerable GetRows(IEnumerable textIds) { - foreach (NParallelTextRow nRow in _corpus.GetRows()) + foreach (NParallelTextRow nRow in _corpus.GetRows(textIds)) { if (nRow.N == 0 || nRow.IsEmpty) continue; diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index 22a2e91ac..ad5fc73ad 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -354,7 +354,10 @@ private IEnumerable CreateRows( } else { - refs.Add(CorrectVersification(refRefs, i)); + if (Corpora[i].IsScripture()) + refs.Add(CorrectVersification(refRefs, i)); + else + refs.Add(new object[] { }); flags.Add(forceInRange != null && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None); } } @@ -381,7 +384,7 @@ private IEnumerable CreateMinRefRows( .ToList(); List alreadyYielded = new List(); - + TextRow[] textRows; foreach (int i in minRefIndexes) { TextRow textRow = currentRows[i]; @@ -394,7 +397,7 @@ private IEnumerable CreateMinRefRows( alreadyYielded.Add(i); foreach (TextRow sameRefRow in sameRefRows) { - var textRows = new TextRow[N]; + textRows = new TextRow[N]; textRows[i] = textRow; textRows[j] = sameRefRow; foreach ( @@ -407,13 +410,16 @@ NParallelTextRow row in CreateRows(rangeInfo, textRows, forceInRange: forceInRan } } } + textRows = new TextRow[N]; + var forceCurrentInRange = new bool[N]; foreach (int i in minRefIndexes.Where(i => AllRowsList[i]).Except(alreadyYielded)) { TextRow textRow = currentRows[i]; - var textRows = new TextRow[N]; textRows[i] = textRow; - var forceCurrentInRange = new bool[N]; forceCurrentInRange[i] = forceCurrentInRange[i]; + } + if (textRows.Any(tr => tr != null)) + { foreach (NParallelTextRow row in CreateRows(rangeInfo, textRows, forceCurrentInRange)) { yield return row; diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs index 29b645b9a..d813aff44 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs @@ -1,4 +1,5 @@ -using NUnit.Framework; +using System.Text.Json; +using NUnit.Framework; using SIL.Scripture; namespace SIL.Machine.Corpora; @@ -64,4 +65,151 @@ public void ExtractScripture() Assert.That(origRef, Is.EqualTo(new VerseRef("MAT 2:12", ScrVers.Original))); Assert.That(corpusRef, Is.EqualTo(new VerseRef("MAT 2:12", corpus.Versification))); } + + [Test] + public void MergedCorpus_SelectFirst() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 1, "source 1 segment 1 ."), TextRow("text1", 3) }) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var mergedCorpus = nParallelCorpus.SelectFirst(); + var rows = mergedCorpus.ToArray(); + Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[1].Text, Is.EqualTo("source 2 segment 2 .")); + Assert.That(rows[2].Text, Is.EqualTo("source 3 segment 3 .")); + } + + [Test] + public void MergedCorpus_SelectRandom_Seed123456() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 1 segment 1 ."), + TextRow("text1", 2, "source 1 segment 2 ."), + TextRow("text1", 3, "source 1 segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3, "source 2 segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var mergedCorpus = nParallelCorpus.SelectRandom(123456); + var rows = mergedCorpus.ToArray(); + Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.Multiple(() => + { + Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[1].Text, Is.EqualTo("source 1 segment 2 .")); + Assert.That(rows[2].Text, Is.EqualTo("source 1 segment 3 .")); + }); + } + + [Test] + public void MergedCorpus_SelectRandom_Seed4501() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 1 segment 1 ."), + TextRow("text1", 2, "source 1 segment 2 ."), + TextRow("text1", 3, "source 1 segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3, "source 2 segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var mergedCorpus = nParallelCorpus.SelectRandom(4501); + var rows = mergedCorpus.ToArray(); + Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.Multiple(() => + { + Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[1].Text, Is.EqualTo("source 2 segment 2 .")); + Assert.That(rows[2].Text, Is.EqualTo("source 3 segment 3 .")); + }); + } + + private static TextRow TextRow( + string textId, + object rowRef, + string text = "", + TextRowFlags flags = TextRowFlags.SentenceStart + ) + { + return new TextRow(textId, rowRef) + { + Segment = text.Length == 0 ? Array.Empty() : text.Split(), + Flags = flags + }; + } } diff --git a/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs index cb1b4f4ba..ee3a9150f 100644 --- a/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs @@ -163,6 +163,49 @@ public void GetRows_ThreeCorpora_MissingRows_SomeAllRows() Assert.That(rows[1].GetIsSentenceStart(1), Is.False); } + [Test] + public void GetRows_ThreeCorpora_MissingRows_AllAllRows_MissingMiddle() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[1].NRefs.All(r => r.Count == 0 || (int)r[0] == 2)); + Assert.That(rows[1].NSegments.All(r => r.Count == 0 || r.SequenceEqual("source segment 2 .".Split()))); + Assert.That(rows[1].GetIsSentenceStart(1), Is.True); + } + [Test] public void GetRows_ThreeCorpora_MissingRows_MissingLastRows() { From bd0ec45a397657f690c8ab2e4037c1fbb572c006 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 8 Nov 2024 16:38:47 -0500 Subject: [PATCH 11/26] More fixes --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 49 +- .../Corpora/NParallelTextCorpus.cs | 30 +- src/SIL.Machine/Corpora/NParallelTextRow.cs | 4 +- src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 661 +----------------- .../Corpora/CorporaExtensionsTests.cs | 86 +++ 5 files changed, 154 insertions(+), 676 deletions(-) diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index 0c6c4228d..a737d8c13 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -245,6 +245,16 @@ public static IParallelTextCorpus AlignRows( }; } + public static NParallelTextCorpus AlignMany(this ITextCorpus[] corpora, bool[] allRowsPerCorpus = null) + { + NParallelTextCorpus nParallelTextCorpus = new NParallelTextCorpus(corpora); + if (allRowsPerCorpus != null) + { + nParallelTextCorpus.AllRowsList = allRowsPerCorpus; + } + return nParallelTextCorpus; + } + public static (ITextCorpus, ITextCorpus, int, int) Split( this ITextCorpus corpus, double? percent = null, @@ -564,35 +574,46 @@ public MergedCorpus(NParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule public override IEnumerable GetRows(IEnumerable textIds) { + int indexOfInRangeRow = -1; foreach (NParallelTextRow nRow in _corpus.GetRows(textIds)) { - if (nRow.N == 0 || nRow.IsEmpty) - continue; IReadOnlyList nonEmptyIndices = nRow .NSegments.Select((s, i) => (s, i)) - .Where(pair => pair.s.Count > 0) + .Where(pair => pair.s.Count > 0 || nRow.GetIsInRange(pair.i)) .Select(pair => pair.i) .ToList(); IReadOnlyList indices = nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList(); + if (indexOfInRangeRow == -1) + { + indices = indices.Where(i => nRow.GetIsRangeStart(i) || !nRow.GetIsInRange(i)).ToList(); + } + if (indices.Count == 0) + continue; + int indexOfSelectedRow = -1; switch (_mergeRule) { case MergeRule.First: - yield return new TextRow(nRow.TextId, nRow.NRefs[indices.First()]) - { - Segment = nRow.NSegments[indices.First()], - Flags = nRow.NFlags[indices.First()] - }; + indexOfSelectedRow = indices.First(); break; case MergeRule.Random: - int i = _random.Next(0, indices.Count); - yield return new TextRow(nRow.TextId, nRow.NRefs[i]) - { - Segment = nRow.NSegments[i], - Flags = nRow.NFlags[i] - }; + indexOfSelectedRow = indices[_random.Next(0, indices.Count)]; break; } + indexOfSelectedRow = indexOfInRangeRow != -1 ? indexOfInRangeRow : indexOfSelectedRow; + if (!nRow.GetIsInRange(indexOfSelectedRow)) + { + indexOfInRangeRow = -1; + } + if (nRow.GetIsRangeStart(indexOfSelectedRow)) + { + indexOfInRangeRow = indexOfSelectedRow; + } + yield return new TextRow(nRow.TextId, nRow.Ref) + { + Segment = nRow.NSegments[indexOfSelectedRow], + Flags = nRow.NFlags[indexOfSelectedRow] + }; } } } diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index ad5fc73ad..b9da97976 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -155,7 +155,10 @@ IEnumerator alignmentEnumerator var currentIncompleteRows = currentRows.Where((r, i) => !completed[i]).ToArray(); IList nonMinRefIndexes = System.Linq.Enumerable.Range(0, N).Except(minRefIndexes).ToList(); - if (minRefIndexes.Count < (N - completed.Count(c => c)) || completed.Where(c => !c).Count() == 1) //then there are some non-min refs or only one incomplete enumerator + if ( + minRefIndexes.Count < (N - completed.Count(c => c)) + || completed.Where((c, i) => !c && minRefIndexes.Contains(i)).Count() == 1 + ) //then there are some non-min refs or only one incomplete enumerator { IList> minEnumerators = minRefIndexes .Select(i => listOfEnumerators[i]) @@ -285,7 +288,7 @@ NParallelTextRow row in CreateMinRefRows( foreach ( NParallelTextRow row in CreateRows( rangeInfo, - currentIncompleteRows, + currentRows.Select((r, i) => completed[i] ? null : r).ToArray(), alignedWordPairs: AlignmentCorpus != null && compareAlignmentCorpus == 0 ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() : null @@ -338,29 +341,30 @@ private IEnumerable CreateRows( if (rows.All(r => r == null)) throw new ArgumentNullException("A corpus row must be specified."); - object[] refRefs = new object[] { rows.Select(r => r?.Ref).First() }; + object[] defaultRefs = new object[] { }; + if (rows.Any(r => r != null)) + defaultRefs = new object[] { rows.Where(r => r != null).Select(r => r.Ref).First() }; string textId = null; - IList refs = new List(); - IList flags = new List(); + object[][] refs = new object[N][]; + TextRowFlags[] flags = new TextRowFlags[N]; for (int i = 0; i < rows.Count; i++) { if (rows[i] != null) { textId = textId ?? rows[i]?.TextId; - refs.Add( - CorrectVersification(rows[i].Ref == null ? new object[] { } : new object[] { rows[i].Ref }, i) - ); - flags.Add(rows[i].Flags); + refs[i] = CorrectVersification(rows[i].Ref == null ? defaultRefs : new object[] { rows[i].Ref }, i); + flags[i] = rows[i].Flags; } else { if (Corpora[i].IsScripture()) - refs.Add(CorrectVersification(refRefs, i)); + refs[i] = CorrectVersification(defaultRefs, i); else - refs.Add(new object[] { }); - flags.Add(forceInRange != null && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None); + refs[i] = new object[] { }; + flags[i] = forceInRange != null && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None; } } + refs = refs.Select(r => r ?? (new object[] { })).ToArray(); yield return new NParallelTextRow(textId, refs) { @@ -524,7 +528,7 @@ public NParallelTextRow CreateRow() } } - private class DefaultRowRefComparer : IComparer + public class DefaultRowRefComparer : IComparer { public int Compare(object x, object y) { diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs index cc04b52ea..fd60d8d99 100644 --- a/src/SIL.Machine/Corpora/NParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -13,8 +13,8 @@ public NParallelTextRow(string textId, IEnumerable> nRefs) if (string.IsNullOrEmpty(textId)) throw new ArgumentNullException(nameof(textId)); - if (nRefs.SelectMany(r => r).Count() == 0) - throw new ArgumentNullException("Either a source or target ref must be provided."); + if (nRefs == null || nRefs.Where(r => r != null).SelectMany(r => r).Count() == 0) + throw new ArgumentNullException($"Refs must be provided but nRefs={nRefs}"); TextId = textId; NRefs = nRefs.ToList().ToReadOnlyList(); diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index e015dc470..8e79b5450 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -1,9 +1,4 @@ -using System; -using System.Collections; -using System.Collections.Generic; -using System.Linq; -using SIL.ObjectModel; -using SIL.Scripture; +using System.Collections.Generic; namespace SIL.Machine.Corpora { @@ -19,7 +14,7 @@ public ParallelTextCorpus( SourceCorpus = sourceCorpus; TargetCorpus = targetCorpus; AlignmentCorpus = alignmentCorpus ?? new DictionaryAlignmentCorpus(); - RowRefComparer = rowRefComparer ?? new DefaultRowRefComparer(); + RowRefComparer = rowRefComparer ?? new NParallelTextCorpus.DefaultRowRefComparer(); NParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }) { AlignmentCorpus = AlignmentCorpus @@ -41,650 +36,22 @@ public ParallelTextCorpus( public override IEnumerable GetRows(IEnumerable textIds) { - if (2 > RowRefComparer.Compare(0, 0)) + NParallelTextCorpus.AllRowsList = new bool[] { AllSourceRows, AllTargetRows }; + bool isScripture = SourceCorpus.IsScripture() && TargetCorpus.IsScripture(); + foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) { - //TODO rework - just for testing - NParallelTextCorpus.AllRowsList = new bool[] { AllSourceRows, AllTargetRows }; - - foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) - { - yield return new ParallelTextRow(nRow.TextId, nRow.NRefs[0], nRow.NRefs[1]) - { - SourceFlags = nRow.NFlags[0], - TargetFlags = nRow.NFlags[1], - SourceSegment = nRow.NSegments[0], - TargetSegment = nRow.NSegments[1], - AlignedWordPairs = nRow.AlignedWordPairs - }; - } - } - else - { - IEnumerable sourceTextIds = SourceCorpus.Texts.Select(t => t.Id); - IEnumerable targetTextIds = TargetCorpus.Texts.Select(t => t.Id); - - HashSet filterTextIds; - if (AllSourceRows && AllTargetRows) - { - filterTextIds = new HashSet(sourceTextIds); - filterTextIds.UnionWith(targetTextIds); - } - else if (!AllSourceRows && !AllTargetRows) - { - filterTextIds = new HashSet(sourceTextIds); - filterTextIds.IntersectWith(targetTextIds); - } - else if (AllSourceRows) - { - filterTextIds = new HashSet(sourceTextIds); - } - else - { - filterTextIds = new HashSet(targetTextIds); - } - - if (textIds != null) - filterTextIds.IntersectWith(textIds); - - using (IEnumerator srcEnumerator = SourceCorpus.GetRows(filterTextIds).GetEnumerator()) - using ( - var trgEnumerator = new TargetCorpusEnumerator( - TargetCorpus.GetRows(filterTextIds).GetEnumerator(), - SourceCorpus.Versification, - TargetCorpus.Versification - ) + yield return new ParallelTextRow( + nRow.TextId, + nRow.NRefs[0].Count > 0 || !isScripture ? nRow.NRefs[0] : new object[] { nRow.Ref }, + nRow.NRefs[1].Count > 0 || !isScripture ? nRow.NRefs[1] : new object[] { nRow.Ref } ) - using ( - IEnumerator alignmentEnumerator = AlignmentCorpus - .GetRows(filterTextIds) - .GetEnumerator() - ) - { - var rangeInfo = new RangeInfo { TargetVersification = TargetCorpus.Versification }; - var sourceSameRefRows = new List(); - var targetSameRefRows = new List(); - - bool srcCompleted = !srcEnumerator.MoveNext(); - bool trgCompleted = !trgEnumerator.MoveNext(); - while (!srcCompleted && !trgCompleted) - { - int compare1 = 0; - try - { - compare1 = RowRefComparer.Compare(srcEnumerator.Current.Ref, trgEnumerator.Current.Ref); - } - catch (ArgumentException) - { - throw new CorpusAlignmentException( - srcEnumerator.Current.Ref.ToString(), - trgEnumerator.Current.Ref.ToString() - ); - } - if (compare1 < 0) - { - // source is less than target - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateSourceRows( - rangeInfo, - srcEnumerator.Current, - targetSameRefRows, - forceTargetInRange: srcEnumerator.Current.TextId == trgEnumerator.Current.TextId - && !trgEnumerator.Current.IsRangeStart - && trgEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - } - else if (compare1 > 0) - { - if (!AllSourceRows && trgEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && srcEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateTargetRows( - rangeInfo, - trgEnumerator.Current, - sourceSameRefRows, - forceSourceInRange: trgEnumerator.Current.TextId == srcEnumerator.Current.TextId - && !srcEnumerator.Current.IsRangeStart - && srcEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); - } - else - { - int compare2; - do - { - try - { - compare2 = alignmentEnumerator.MoveNext() - ? RowRefComparer.Compare( - srcEnumerator.Current.Ref, - alignmentEnumerator.Current.Ref - ) - : 1; - } - catch (ArgumentException) - { - throw new CorpusAlignmentException( - srcEnumerator.Current.Ref.ToString(), - trgEnumerator.Current.Ref.ToString() - ); - } - } while (compare2 < 0); - - if ( - (!AllTargetRows && srcEnumerator.Current.IsInRange) - || (!AllSourceRows && trgEnumerator.Current.IsInRange) - ) - { - if ( - rangeInfo.IsInRange - && ( - ( - srcEnumerator.Current.IsInRange - && !trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - || ( - !srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - || ( - srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - && trgEnumerator.Current.Segment.Count > 0 - ) - ) - ) - { - yield return rangeInfo.CreateRow(); - } - - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) - { - foreach (TextRow prevSourceRow in sourceSameRefRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - prevSourceRow, - trgEnumerator.Current - ) - ) - { - yield return row; - } - } - } - - if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) - { - foreach (TextRow prevTargetRow in targetSameRefRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - prevTargetRow - ) - ) - { - yield return row; - } - } - } - - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - trgEnumerator.Current, - compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); - } - } - - while (!srcCompleted) - { - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateSourceRows( - rangeInfo, - srcEnumerator.Current, - targetSameRefRows - ) - ) - { - yield return row; - } - } - srcCompleted = !srcEnumerator.MoveNext(); - } - - while (!trgCompleted) - { - if (!AllSourceRows && trgEnumerator.Current.IsInRange) - { - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateTargetRows( - rangeInfo, - trgEnumerator.Current, - sourceSameRefRows - ) - ) - { - yield return row; - } - } - trgCompleted = !trgEnumerator.MoveNext(); - } - - if (rangeInfo.IsInRange) - yield return rangeInfo.CreateRow(); - } - } - } - - private IEnumerable CreateRows( - RangeInfo rangeInfo, - TextRow srcRow, - TextRow trgRow, - IReadOnlyCollection alignedWordPairs = null, - bool forceSourceInRange = false, - bool forceTargetInRange = false - ) - { - if (rangeInfo.IsInRange) - yield return rangeInfo.CreateRow(); - - string textId; - if (srcRow != null) - textId = srcRow.TextId; - else if (trgRow != null) - textId = trgRow.TextId; - else - throw new ArgumentNullException("Either a source or target must be specified."); - - object[] sourceRefs = srcRow != null ? new object[] { srcRow.Ref } : Array.Empty(); - object[] targetRefs = trgRow != null ? new object[] { trgRow.Ref } : Array.Empty(); - if (targetRefs.Length == 0 && TargetCorpus.IsScripture()) - { - targetRefs = sourceRefs - .Cast() - .Select(r => r.ChangeVersification(TargetCorpus.Versification)) - .Cast() - .ToArray(); - } - - TextRowFlags sourceFlags; - if (srcRow == null) - sourceFlags = forceSourceInRange ? TextRowFlags.InRange : TextRowFlags.None; - else - sourceFlags = srcRow.Flags; - - TextRowFlags targetFlags; - if (trgRow == null) - targetFlags = forceTargetInRange ? TextRowFlags.InRange : TextRowFlags.None; - else - targetFlags = trgRow.Flags; - - yield return new ParallelTextRow(textId, sourceRefs, targetRefs) - { - SourceSegment = srcRow != null ? srcRow.Segment : Array.Empty(), - TargetSegment = trgRow != null ? trgRow.Segment : Array.Empty(), - AlignedWordPairs = alignedWordPairs, - SourceFlags = sourceFlags, - TargetFlags = targetFlags - }; - } - - private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) - { - try - { - if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) - sameRefRows.Clear(); - } - catch (ArgumentException) - { - throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); - } - return sameRefRows.Count > 0; - } - - private IEnumerable CreateSourceRows( - RangeInfo rangeInfo, - TextRow sourceRow, - List targetSameRefRows, - bool forceTargetInRange = false - ) - { - if (CheckSameRefRows(targetSameRefRows, sourceRow)) - { - foreach (TextRow targetSameRefRow in targetSameRefRows) { - foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceRow, targetSameRefRow)) - yield return row; - } - } - else if (AllSourceRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - sourceRow, - null, - forceTargetInRange: forceTargetInRange - ) - ) - { - yield return row; - } - } - } - - private IEnumerable CreateTargetRows( - RangeInfo rangeInfo, - TextRow targetRow, - List sourceSameRefRows, - bool forceSourceInRange = false - ) - { - if (CheckSameRefRows(sourceSameRefRows, targetRow)) - { - foreach (TextRow sourceSameRefRow in sourceSameRefRows) - { - foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceSameRefRow, targetRow)) - yield return row; - } - } - else if (AllTargetRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - null, - targetRow, - forceSourceInRange: forceSourceInRange - ) - ) - { - yield return row; - } - } - } - - private class RangeInfo - { - public string TextId { get; set; } = ""; - public List SourceRefs { get; } = new List(); - public List TargetRefs { get; } = new List(); - public List SourceSegment { get; } = new List(); - public List TargetSegment { get; } = new List(); - public bool IsSourceSentenceStart { get; set; } = false; - public bool IsTargetSentenceStart { get; set; } = false; - public bool IsInRange => SourceRefs.Count > 0 || TargetRefs.Count > 0; - public bool IsSourceEmpty => SourceSegment.Count == 0; - public bool IsTargetEmpty => TargetSegment.Count == 0; - - public ScrVers TargetVersification { get; set; } = null; - - public ParallelTextRow CreateRow() - { - object[] trgRefs = TargetRefs.ToArray(); - if (TargetRefs.Count == 0 && TargetVersification != null) - { - trgRefs = SourceRefs - .ToArray() - .Cast() - .Select(r => r.ChangeVersification(TargetVersification)) - .Cast() - .ToArray(); - } - var row = new ParallelTextRow(TextId, SourceRefs.ToArray(), trgRefs) - { - SourceSegment = SourceSegment.ToArray(), - TargetSegment = TargetSegment.ToArray(), - SourceFlags = IsSourceSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None, - TargetFlags = IsTargetSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None + SourceFlags = nRow.NFlags[0], + TargetFlags = nRow.NFlags[1], + SourceSegment = nRow.NSegments[0], + TargetSegment = nRow.NSegments[1], + AlignedWordPairs = nRow.AlignedWordPairs }; - TextId = ""; - SourceRefs.Clear(); - TargetRefs.Clear(); - SourceSegment.Clear(); - TargetSegment.Clear(); - IsSourceSentenceStart = false; - IsTargetSentenceStart = false; - return row; - } - } - - private class DefaultRowRefComparer : IComparer - { - public int Compare(object x, object y) - { - // Do not use the default comparer for ScriptureRef, since we want to ignore segments - if (x is ScriptureRef sx && y is ScriptureRef sy) - return sx.CompareTo(sy, compareSegments: false); - - return Comparer.Default.Compare(x, y); - } - } - - private class TargetCorpusEnumerator : DisposableBase, IEnumerator - { - private readonly IEnumerator _enumerator; - private readonly bool _isScripture = false; - private readonly Queue _verseRows; - private readonly ScrVers _sourceVersification; - private TextRow _current; - private bool _isEnumerating = false; - private bool _enumeratorHasMoreData = true; - - public TargetCorpusEnumerator( - IEnumerator enumerator, - ScrVers sourceVersification, - ScrVers targetVersification - ) - { - _enumerator = enumerator; - _sourceVersification = sourceVersification; - _isScripture = - sourceVersification != null - && targetVersification != null - && sourceVersification != targetVersification; - _verseRows = new Queue(); - } - - public TextRow Current => _current; - - object IEnumerator.Current => Current; - - public bool MoveNext() - { - if (_isScripture) - { - if (!_isEnumerating) - { - _enumerator.MoveNext(); - _isEnumerating = true; - } - if (_verseRows.Count == 0 && _enumerator.Current != null && _enumeratorHasMoreData) - CollectVerses(); - if (_verseRows.Count > 0) - { - _current = _verseRows.Dequeue(); - return true; - } - _current = null; - return false; - } - - _enumeratorHasMoreData = _enumerator.MoveNext(); - _current = _enumerator.Current; - return _enumeratorHasMoreData; - } - - public void Reset() - { - _enumerator.Reset(); - _isEnumerating = false; - _enumeratorHasMoreData = true; - } - - protected override void DisposeManagedResources() - { - _enumerator.Dispose(); - } - - private void CollectVerses() - { - var rowList = new List<(ScriptureRef Ref, TextRow Row)>(); - bool outOfOrder = false; - ScriptureRef prevScrRef = ScriptureRef.Empty; - int rangeStartOffset = -1; - do - { - TextRow row = _enumerator.Current; - var scrRef = (ScriptureRef)row.Ref; - if (!prevScrRef.IsEmpty && scrRef.BookNum != prevScrRef.BookNum) - break; - - scrRef = scrRef.ChangeVersification(_sourceVersification); - // convert one-to-many versification mapping to a verse range - if (scrRef.Equals(prevScrRef)) - { - (ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[ - rowList.Count + rangeStartOffset - ]; - TextRowFlags flags = TextRowFlags.InRange; - if (rangeStartRow.IsSentenceStart) - flags |= TextRowFlags.SentenceStart; - if (rangeStartOffset == -1 && (!rangeStartRow.IsInRange || rangeStartRow.IsRangeStart)) - flags |= TextRowFlags.RangeStart; - rowList[rowList.Count + rangeStartOffset] = ( - rangeStartVerseRef, - new TextRow(rangeStartRow.TextId, rangeStartRow.Ref) - { - Segment = rangeStartRow.Segment.Concat(row.Segment).ToArray(), - Flags = flags - } - ); - row = new TextRow(row.TextId, row.Ref) { Flags = TextRowFlags.InRange }; - rangeStartOffset--; - } - else - { - rangeStartOffset = -1; - } - rowList.Add((scrRef, row)); - if (!outOfOrder && scrRef.CompareTo(prevScrRef) < 0) - outOfOrder = true; - prevScrRef = scrRef; - _enumeratorHasMoreData = _enumerator.MoveNext(); - } while (_enumeratorHasMoreData); - - if (outOfOrder) - rowList.Sort((x, y) => x.Ref.CompareTo(y.Ref)); - - foreach ((ScriptureRef _, TextRow row) in rowList) - _verseRows.Enqueue(row); } } } diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs index d813aff44..5f397bc37 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs @@ -199,6 +199,92 @@ public void MergedCorpus_SelectRandom_Seed4501() }); } + [Test] + public void AlignMergedCorpora() + { + var sourceCorpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 1 segment 1 ."), + TextRow("text1", 2, "source 1 segment 2 ."), + TextRow("text1", 3, "source 1 segment 3 .") + } + ) + ); + var sourceCorpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3, "source 2 segment 3 .") + } + ) + ); + var sourceCorpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + + ITextCorpus sourceCorpus = (new ITextCorpus[] { sourceCorpus1, sourceCorpus1, sourceCorpus3 }) + .AlignMany([true, true, true]) + .SelectFirst(); + + var targetCorpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "target 1 segment 1 ."), + TextRow("text1", 2, "target 1 segment 2 ."), + TextRow("text1", 3, "target 1 segment 3 .") + } + ) + ); + var targetCorpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "target 2 segment 1 ."), + TextRow("text1", 2, "target 2 segment 2 ."), + TextRow("text1", 3, "target 2 segment 3 .") + } + ) + ); + var targetCorpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "target 3 segment 1 ."), + TextRow("text1", 2, "target 3 segment 2 ."), + TextRow("text1", 3, "target 3 segment 3 .") + } + ) + ); + + ITextCorpus targetCorpus = (new ITextCorpus[] { targetCorpus1, targetCorpus2, targetCorpus3 }) + .AlignMany([true, true, true]) + .SelectFirst(); + + IParallelTextCorpus alignedCorpus = sourceCorpus.AlignRows(targetCorpus); + ParallelTextRow[] rows = alignedCorpus.GetRows().ToArray(); + Assert.That(rows, Has.Length.EqualTo(3)); + Assert.That(rows[0].SourceText, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[2].TargetText, Is.EqualTo("target 1 segment 3 .")); + } + private static TextRow TextRow( string textId, object rowRef, From 57b759da66a2673e02b6663830fa86bb4ed5d1f1 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 8 Nov 2024 16:48:01 -0500 Subject: [PATCH 12/26] Change naming to avoid confusion with 'Select' --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 4 ++-- .../Corpora/CorporaExtensionsTests.cs | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index a737d8c13..a9e2b4f96 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -377,12 +377,12 @@ public static ITextCorpus FilterTexts(this ITextCorpus corpus, IEnumerable @@ -188,7 +188,7 @@ public void MergedCorpus_SelectRandom_Seed4501() ) ); var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; - var mergedCorpus = nParallelCorpus.SelectRandom(4501); + var mergedCorpus = nParallelCorpus.ChooseRandom(4501); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); Assert.Multiple(() => @@ -238,7 +238,7 @@ public void AlignMergedCorpora() ITextCorpus sourceCorpus = (new ITextCorpus[] { sourceCorpus1, sourceCorpus1, sourceCorpus3 }) .AlignMany([true, true, true]) - .SelectFirst(); + .ChooseFirst(); var targetCorpus1 = new DictionaryTextCorpus( new MemoryText( @@ -276,7 +276,7 @@ public void AlignMergedCorpora() ITextCorpus targetCorpus = (new ITextCorpus[] { targetCorpus1, targetCorpus2, targetCorpus3 }) .AlignMany([true, true, true]) - .SelectFirst(); + .ChooseFirst(); IParallelTextCorpus alignedCorpus = sourceCorpus.AlignRows(targetCorpus); ParallelTextRow[] rows = alignedCorpus.GetRows().ToArray(); From 7858e2082d7c5c83b7bad91ab6d4f5b9b89eed1b Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 23 Oct 2024 10:42:11 -0400 Subject: [PATCH 13/26] Working NParallelTextCorpus more broken More broken. Compiling but not working Progress More progress Almost all tests passing All PTCorpus tests passing! Passing tests; added alignment corpus Fix test; add corpora extensions test More fixes --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 97 +++ .../Corpora/CorpusAlignmentException.cs | 5 + .../Corpora/INParallelTextCorpus.cs | 11 + .../Corpora/NParallelTextCorpus.cs | 547 +++++++++++++++ .../Corpora/NParallelTextCorpusBase.cs | 36 + src/SIL.Machine/Corpora/NParallelTextRow.cs | 56 ++ src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 633 +----------------- .../Corpora/TextCorpusEnumerator.cs | 122 ++++ .../Corpora/CorporaExtensionsTests.cs | 236 ++++++- .../Corpora/NParallelTextCorpusTests.cs | 585 ++++++++++++++++ 10 files changed, 1714 insertions(+), 614 deletions(-) create mode 100644 src/SIL.Machine/Corpora/INParallelTextCorpus.cs create mode 100644 src/SIL.Machine/Corpora/NParallelTextCorpus.cs create mode 100644 src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs create mode 100644 src/SIL.Machine/Corpora/NParallelTextRow.cs create mode 100644 src/SIL.Machine/Corpora/TextCorpusEnumerator.cs create mode 100644 tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index 7d974366e..a737d8c13 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -245,6 +245,16 @@ public static IParallelTextCorpus AlignRows( }; } + public static NParallelTextCorpus AlignMany(this ITextCorpus[] corpora, bool[] allRowsPerCorpus = null) + { + NParallelTextCorpus nParallelTextCorpus = new NParallelTextCorpus(corpora); + if (allRowsPerCorpus != null) + { + nParallelTextCorpus.AllRowsList = allRowsPerCorpus; + } + return nParallelTextCorpus; + } + public static (ITextCorpus, ITextCorpus, int, int) Split( this ITextCorpus corpus, double? percent = null, @@ -367,6 +377,16 @@ public static ITextCorpus FilterTexts(this ITextCorpus corpus, IEnumerable GetRows(IEnumerable textIds) } } + private enum MergeRule + { + First = 1, + Random = 2 + } + + private class MergedCorpus : TextCorpusBase + { + private readonly NParallelTextCorpus _corpus; + + private readonly MergeRule _mergeRule; + + private readonly Random _random; + + private readonly int _seed; + + public MergedCorpus(NParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule, int seed) + { + _corpus = nParallelTextCorpus; + _mergeRule = mergeRule; + _seed = seed; + _random = new Random(_seed); + } + + public override IEnumerable Texts => _corpus.Corpora.SelectMany(c => c.Texts); + + public override bool IsTokenized => + Enumerable.Range(0, _corpus.N).Select(i => _corpus.GetIsTokenized(i)).All(b => b); + + public override ScrVers Versification => _corpus.N > 0 ? _corpus.Corpora.First().Versification : null; + + public override IEnumerable GetRows(IEnumerable textIds) + { + int indexOfInRangeRow = -1; + foreach (NParallelTextRow nRow in _corpus.GetRows(textIds)) + { + IReadOnlyList nonEmptyIndices = nRow + .NSegments.Select((s, i) => (s, i)) + .Where(pair => pair.s.Count > 0 || nRow.GetIsInRange(pair.i)) + .Select(pair => pair.i) + .ToList(); + IReadOnlyList indices = + nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList(); + if (indexOfInRangeRow == -1) + { + indices = indices.Where(i => nRow.GetIsRangeStart(i) || !nRow.GetIsInRange(i)).ToList(); + } + if (indices.Count == 0) + continue; + int indexOfSelectedRow = -1; + switch (_mergeRule) + { + case MergeRule.First: + indexOfSelectedRow = indices.First(); + break; + case MergeRule.Random: + indexOfSelectedRow = indices[_random.Next(0, indices.Count)]; + break; + } + indexOfSelectedRow = indexOfInRangeRow != -1 ? indexOfInRangeRow : indexOfSelectedRow; + if (!nRow.GetIsInRange(indexOfSelectedRow)) + { + indexOfInRangeRow = -1; + } + if (nRow.GetIsRangeStart(indexOfSelectedRow)) + { + indexOfInRangeRow = indexOfSelectedRow; + } + yield return new TextRow(nRow.TextId, nRow.Ref) + { + Segment = nRow.NSegments[indexOfSelectedRow], + Flags = nRow.NFlags[indexOfSelectedRow] + }; + } + } + } + #endregion #region IAlignmentCorpus operations diff --git a/src/SIL.Machine/Corpora/CorpusAlignmentException.cs b/src/SIL.Machine/Corpora/CorpusAlignmentException.cs index c86dd8cfd..2b8129858 100644 --- a/src/SIL.Machine/Corpora/CorpusAlignmentException.cs +++ b/src/SIL.Machine/Corpora/CorpusAlignmentException.cs @@ -8,5 +8,10 @@ public CorpusAlignmentException(string sourceRef, string targetRef) : base( $"Invalid format in {sourceRef} and {targetRef}. Mismatched key formats \"{sourceRef}\" and \"{targetRef}\". There may be an extraneous tab, missing ref, or inconsistent use of user-defined refs." ) { } + + public CorpusAlignmentException(string[] refs) + : base( + $"Invalid format in {string.Join(", ", refs)}. Mismatched key formats. There may be an extraneous tab, missing ref, or inconsistent use of user-defined refs." + ) { } } } diff --git a/src/SIL.Machine/Corpora/INParallelTextCorpus.cs b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs new file mode 100644 index 000000000..5a1e86f76 --- /dev/null +++ b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs @@ -0,0 +1,11 @@ +using System.Collections.Generic; + +namespace SIL.Machine.Corpora +{ + public interface INParallelTextCorpus : ICorpus + { + int Count(bool includeEmpty = true, IEnumerable textIds = null); + + IEnumerable GetRows(IEnumerable textIds); + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs new file mode 100644 index 000000000..b9da97976 --- /dev/null +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -0,0 +1,547 @@ +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using SIL.Extensions; +using SIL.Linq; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public class NParallelTextCorpus : NParallelTextCorpusBase + { + public NParallelTextCorpus(IEnumerable corpora, IComparer rowRefComparer = null) + { + Corpora = corpora.ToImmutableArray(); + if (Corpora.Count < 1) + throw new ArgumentException("There must be at least one corpora.", nameof(corpora)); + RowRefComparer = rowRefComparer ?? new DefaultRowRefComparer(); + AllRowsList = new bool[Corpora.Count] + .Select(_ => false) + .ToImmutableArray(); + } + + public bool GetIsTokenized(int i) => + i < Corpora.Count ? Corpora[i].IsTokenized : throw new ArgumentOutOfRangeException(nameof(i)); + + public int N => Corpora.Count; + + public IReadOnlyList AllRowsList { get; set; } + public IReadOnlyList Corpora { get; } + public IAlignmentCorpus AlignmentCorpus { get; set; } + public IComparer RowRefComparer { get; } + + private static HashSet GetTextIdsFromCorpora( + IEnumerable corpora, + IEnumerable allRowsEnumerate + ) + { + IReadOnlyList> textIdListOfLists = corpora + .Select(c => c.Texts.Select(t => t.Id)) + .ToImmutableArray(); + + HashSet textIds = textIdListOfLists + .Skip(1) + .Aggregate( + new HashSet(textIdListOfLists.First()), + (h, e) => + { + h.IntersectWith(e); + return h; + } + ); + allRowsEnumerate + .Select((allRows, i) => (allRows, i)) + .Where(t => t.allRows) + .ForEach(t => textIds.UnionWith(textIdListOfLists[t.i])); + return textIds; + } + + public override IEnumerable GetRows(IEnumerable textIds) + { + HashSet filterTextIds = GetTextIdsFromCorpora(Corpora, AllRowsList); + + if (textIds != null) + filterTextIds.IntersectWith(textIds); + + IEnumerator alignmentEnumerator = null; + IList> enumeratedCorpora = new List>(); + IEnumerable rows = new List() { }; + try + { + for (int i = 0; i < Corpora.Count; i++) + { + var enumerator = Corpora[i].GetRows(filterTextIds).GetEnumerator(); + enumeratedCorpora.Add( + new TextCorpusEnumerator(enumerator, Corpora[0].Versification, Corpora[i].Versification) + ); + } + + if (AlignmentCorpus != null) + alignmentEnumerator = AlignmentCorpus.GetRows(filterTextIds).GetEnumerator(); + rows = GetRows(enumeratedCorpora, alignmentEnumerator).ToList(); + } + finally + { + foreach (IEnumerator enumerator in enumeratedCorpora) + { + enumerator.Dispose(); + } + alignmentEnumerator?.Dispose(); + } + return rows; + } + + private bool AllInRangeHaveSegments(IList rows) + { + return rows.All(r => (r.IsInRange && r.Segment.Count > 0) || (!r.IsInRange)); + } + + private IList MinRefIndexes(IList refs) + { + object minRef = refs[0]; + IList minRefIndexes = new List() { 0 }; + for (int i = 1; i < refs.Count; i++) + { + if (RowRefComparer.Compare(refs[i], minRef) < 0) + { + minRef = refs[i]; + minRefIndexes.Clear(); + minRefIndexes.Add(i); + } + else if (RowRefComparer.Compare(refs[i], minRef) == 0) + { + minRefIndexes.Add(i); + } + } + return minRefIndexes; + } + + private IEnumerable GetRows( + IList> listOfEnumerators, + IEnumerator alignmentEnumerator + ) + { + { + var rangeInfo = new NRangeInfo(N) + { + Versifications = Corpora.Select(c => c.Versification).ToArray(), + RowRefComparer = RowRefComparer + }; + + bool[] completed = listOfEnumerators.Select(e => !e.MoveNext()).ToArray(); + + while (!completed.All(c => c)) + { + IList minRefIndexes; + IList currentRows = listOfEnumerators.Select(e => e.Current).ToArray(); + try + { + minRefIndexes = MinRefIndexes( + currentRows + .Select(e => + { + if (e != null) + return e.Ref; + return null; + }) + .ToArray() + ); + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); + } + var currentIncompleteRows = currentRows.Where((r, i) => !completed[i]).ToArray(); + IList nonMinRefIndexes = System.Linq.Enumerable.Range(0, N).Except(minRefIndexes).ToList(); + + if ( + minRefIndexes.Count < (N - completed.Count(c => c)) + || completed.Where((c, i) => !c && minRefIndexes.Contains(i)).Count() == 1 + ) //then there are some non-min refs or only one incomplete enumerator + { + IList> minEnumerators = minRefIndexes + .Select(i => listOfEnumerators[i]) + .ToList(); + IList> nonMinEnumerators = nonMinRefIndexes + .Select(i => listOfEnumerators[i]) + .ToList(); + + if ( + nonMinRefIndexes.Any(i => !AllRowsList[i]) + && minRefIndexes.Where(i => !completed[i] && listOfEnumerators[i].Current.IsInRange).Any() + ) + { + if ( + rangeInfo.IsInRange + && nonMinEnumerators + .Where(e => e.Current != null && e.Current.IsInRange && e.Current.Segment.Count > 0) + .Any() + ) + { + yield return rangeInfo.CreateRow(); + } + minRefIndexes.ForEach(i => rangeInfo.AddTextRow(listOfEnumerators[i].Current, i)); + nonMinRefIndexes.ForEach(i => rangeInfo.Rows[i].SameRefRows.Clear()); + } + else + { + foreach ( + NParallelTextRow row in CreateMinRefRows( + rangeInfo, + currentRows.ToArray(), + minRefIndexes.ToArray(), + nonMinRefIndexes.ToArray(), + forceInRange: minRefIndexes + .Select(i => + nonMinEnumerators.All(e => + e.Current != null && e.Current.TextId == currentRows[i].TextId + ) + && nonMinEnumerators + .Where(e => e.Current != null) + .Select(e => !e.Current.IsRangeStart && e.Current.IsInRange) + .Any(b => b) + ) + .ToList() + ) + ) + { + yield return row; + } + } + foreach (int i in minRefIndexes) + { + rangeInfo.Rows[i].SameRefRows.Add(listOfEnumerators[i].Current); + completed[i] = !listOfEnumerators[i].MoveNext(); + } + } + else if (minRefIndexes.Count == (N - completed.Count(c => c))) + // the refs are all the same + { + int compareAlignmentCorpus = -1; + if (AlignmentCorpus != null) + { + do + { + try + { + compareAlignmentCorpus = alignmentEnumerator.MoveNext() + ? RowRefComparer.Compare( + currentIncompleteRows[0].Ref, + alignmentEnumerator.Current.Ref + ) + : 1; + } + catch (ArgumentException) + { + throw new CorpusAlignmentException( + currentRows.Select(e => e.Ref.ToString()).ToArray() + ); + } + } while (compareAlignmentCorpus < 0); + } + + if ( + minRefIndexes + .Select(i => + listOfEnumerators[i].Current.IsInRange + && minRefIndexes.All(j => j == i || !AllRowsList[j]) + ) + .Any(b => b) + ) + { + if (rangeInfo.IsInRange && AllInRangeHaveSegments(currentIncompleteRows)) + { + yield return rangeInfo.CreateRow(); + } + + for (int i = 0; i < rangeInfo.Rows.Count; i++) + { + rangeInfo.AddTextRow(currentRows[i], i); + rangeInfo.Rows[i].SameRefRows.Clear(); + } + } + else + { + for (int i = 0; i < rangeInfo.Rows.Count; i++) + { + for (int j = 0; j < rangeInfo.Rows.Count; j++) + { + if (i == j || completed[i] || completed[j]) + continue; + + if (rangeInfo.CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) + { + foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows) + { + var textRows = new TextRow[N]; + textRows[i] = tr; + textRows[j] = currentRows[j]; + foreach (NParallelTextRow r in CreateRows(rangeInfo, textRows)) + { + yield return r; + } + } + } + } + } + foreach ( + NParallelTextRow row in CreateRows( + rangeInfo, + currentRows.Select((r, i) => completed[i] ? null : r).ToArray(), + alignedWordPairs: AlignmentCorpus != null && compareAlignmentCorpus == 0 + ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() + : null + ) + ) + { + yield return row; + } + } + + for (int i = 0; i < rangeInfo.Rows.Count; i++) + { + rangeInfo.Rows[i].SameRefRows.Add(currentRows[i]); + completed[i] = !listOfEnumerators[i].MoveNext(); + } + } + else + { + throw new CorpusAlignmentException( + minRefIndexes.Select(i => currentRows[i].Ref.ToString()).ToArray() + ); + } + } + + if (rangeInfo.IsInRange) + yield return rangeInfo.CreateRow(); + } + } + + private object[] CorrectVersification(object[] refs, int i) + { + if (Corpora.Any(c => c.Versification == null) || refs.Length == 0) + return refs; + return refs.Cast() + .Select(r => r.ChangeVersification(Corpora[i].Versification)) + .Cast() + .ToArray(); + } + + private IEnumerable CreateRows( + NRangeInfo rangeInfo, + IReadOnlyList rows, + IReadOnlyList forceInRange = null, + IReadOnlyList alignedWordPairs = null + ) + { + if (rangeInfo.IsInRange) + yield return rangeInfo.CreateRow(); + + if (rows.All(r => r == null)) + throw new ArgumentNullException("A corpus row must be specified."); + + object[] defaultRefs = new object[] { }; + if (rows.Any(r => r != null)) + defaultRefs = new object[] { rows.Where(r => r != null).Select(r => r.Ref).First() }; + string textId = null; + object[][] refs = new object[N][]; + TextRowFlags[] flags = new TextRowFlags[N]; + for (int i = 0; i < rows.Count; i++) + { + if (rows[i] != null) + { + textId = textId ?? rows[i]?.TextId; + refs[i] = CorrectVersification(rows[i].Ref == null ? defaultRefs : new object[] { rows[i].Ref }, i); + flags[i] = rows[i].Flags; + } + else + { + if (Corpora[i].IsScripture()) + refs[i] = CorrectVersification(defaultRefs, i); + else + refs[i] = new object[] { }; + flags[i] = forceInRange != null && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None; + } + } + refs = refs.Select(r => r ?? (new object[] { })).ToArray(); + + yield return new NParallelTextRow(textId, refs) + { + NSegments = rows.Select(r => r?.Segment ?? Array.Empty()).ToArray(), + NFlags = flags.ToReadOnlyList(), + AlignedWordPairs = alignedWordPairs + }; + } + + private IEnumerable CreateMinRefRows( + NRangeInfo rangeInfo, + IReadOnlyList currentRows, + IReadOnlyList minRefIndexes, + IReadOnlyList nonMinRefIndexes, + IReadOnlyList forceInRange = null + ) + { + List<(IList Rows, int Index)> sameRefRowsPerIndex = nonMinRefIndexes + .Select(i => (rangeInfo.Rows[i], i)) + .Select(pair => (pair.Item1.SameRefRows, pair.Item2)) + .ToList(); + + List alreadyYielded = new List(); + TextRow[] textRows; + foreach (int i in minRefIndexes) + { + TextRow textRow = currentRows[i]; + foreach ((IList sameRefRows, int j) in sameRefRowsPerIndex) + { + if (i == j) + continue; + if (rangeInfo.CheckSameRefRows(sameRefRows, textRow)) + { + alreadyYielded.Add(i); + foreach (TextRow sameRefRow in sameRefRows) + { + textRows = new TextRow[N]; + textRows[i] = textRow; + textRows[j] = sameRefRow; + foreach ( + NParallelTextRow row in CreateRows(rangeInfo, textRows, forceInRange: forceInRange) + ) + { + yield return row; + } + } + } + } + } + textRows = new TextRow[N]; + var forceCurrentInRange = new bool[N]; + foreach (int i in minRefIndexes.Where(i => AllRowsList[i]).Except(alreadyYielded)) + { + TextRow textRow = currentRows[i]; + textRows[i] = textRow; + forceCurrentInRange[i] = forceCurrentInRange[i]; + } + if (textRows.Any(tr => tr != null)) + { + foreach (NParallelTextRow row in CreateRows(rangeInfo, textRows, forceCurrentInRange)) + { + yield return row; + } + } + } + + private class RangeRow + { + public IList Refs { get; } = new List(); + public IList Segment { get; } = new List(); + public IList SameRefRows { get; } = new List(); + public bool IsSentenceStart { get; set; } = false; + public bool IsInRange => Refs.Count > 0; + public bool IsEmpty => Segment.Count == 0; + } + + private class NRangeInfo + { + public int N; + public string TextId { get; set; } = ""; + public ScrVers[] Versifications { get; set; } = null; + public IComparer RowRefComparer { get; set; } = null; + public List Rows { get; } + public bool IsInRange => Rows.Any(r => r.IsInRange); + + public NRangeInfo(int n) + { + N = n; + Rows = new List(); + for (int i = 0; i < N; i++) + { + Rows.Add(new RangeRow()); + } + } + + public bool CheckSameRefRows(IList sameRefRows, TextRow otherRow) + { + try + { + if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) + sameRefRows.Clear(); + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); + } + return sameRefRows.Count > 0; + } + + public void AddTextRow(TextRow row, int index) + { + if (N <= index) + { + throw new ArgumentOutOfRangeException( + $"There are only {N} parallel texts, but text {index} was chosen." + ); + } + TextId = row.TextId; + Rows[index].Refs.Add(row.Ref); + if (Rows[index].IsEmpty) + Rows[index].IsSentenceStart = row.IsSentenceStart; + Rows[index].Segment.AddRange(row.Segment); + } + + public NParallelTextRow CreateRow() + { + object[][] refs = new object[N][]; + IList referenceRefs = Rows.Where(r => r.Refs.Count > 0).Select(r => r.Refs).FirstOrDefault(); + foreach (int i in System.Linq.Enumerable.Range(0, Rows.Count)) + { + var row = Rows[i]; + + if (Versifications.All(v => v != null) && row.Refs.Count() == 0) + { + refs[i] = referenceRefs + .ToArray() + .Cast() + .Select(r => r.ChangeVersification(Versifications[i])) + .Cast() + .ToArray(); + } + else + { + refs[i] = row.Refs.ToArray(); + } + } + var nParRow = new NParallelTextRow(TextId, refs) + { + NSegments = Rows.Select(r => r.Segment.ToArray()).ToArray(), + NFlags = Rows.Select(r => r.IsSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None) + .ToArray() + }; + TextId = ""; + foreach (RangeRow r in Rows) + { + r.Refs.Clear(); + r.Segment.Clear(); + r.IsSentenceStart = false; + } + return nParRow; + } + } + + public class DefaultRowRefComparer : IComparer + { + public int Compare(object x, object y) + { + // Do not use the default comparer for ScriptureRef, since we want to ignore segments + if (x is ScriptureRef sx && y is ScriptureRef sy) + return sx.CompareTo(sy, compareSegments: false); + if (x == null && y != null) + return 1; + if (x != null && y == null) + return -1; + + return Comparer.Default.Compare(x, y); + } + } + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs b/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs new file mode 100644 index 000000000..5487b2001 --- /dev/null +++ b/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs @@ -0,0 +1,36 @@ +using System.Collections; +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.Corpora +{ + public abstract class NParallelTextCorpusBase : INParallelTextCorpus + { + int ICorpus.Count(bool includeEmpty) + { + return Count(includeEmpty, null); + } + + public virtual int Count(bool includeEmpty = true, IEnumerable textIds = null) + { + return includeEmpty ? GetRows(textIds).Count() : GetRows(textIds).Count(r => !r.IsEmpty); + } + + public IEnumerable GetRows() + { + return GetRows(null); + } + + public abstract IEnumerable GetRows(IEnumerable textIds); + + public IEnumerator GetEnumerator() + { + return GetRows().GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs new file mode 100644 index 000000000..fd60d8d99 --- /dev/null +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -0,0 +1,56 @@ +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using SIL.Extensions; + +namespace SIL.Machine.Corpora +{ + public class NParallelTextRow : IRow + { + public NParallelTextRow(string textId, IEnumerable> nRefs) + { + if (string.IsNullOrEmpty(textId)) + throw new ArgumentNullException(nameof(textId)); + + if (nRefs == null || nRefs.Where(r => r != null).SelectMany(r => r).Count() == 0) + throw new ArgumentNullException($"Refs must be provided but nRefs={nRefs}"); + + TextId = textId; + NRefs = nRefs.ToList().ToReadOnlyList(); + N = NRefs.Count; + NSegments = Enumerable.Range(0, N).Select(_ => Array.Empty()).ToImmutableArray(); + NFlags = Enumerable.Range(0, N).Select(_ => TextRowFlags.SentenceStart).ToImmutableArray(); + } + + public string TextId { get; } + + public object Ref => NRefs.SelectMany(r => r).First(); + + public IReadOnlyList> NRefs { get; } + public int N { get; } + + public IReadOnlyList> NSegments { get; set; } + public IReadOnlyList NFlags { get; set; } + + public bool GetIsSentenceStart(int i) => + NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.SentenceStart) : throw new ArgumentOutOfRangeException(); + + public bool GetIsInRange(int i) => + NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.InRange) : throw new ArgumentOutOfRangeException(); + + public bool GetIsRangeStart(int i) => + NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.RangeStart) : throw new ArgumentOutOfRangeException(); + + public bool IsEmpty => NSegments.All(s => s.Count == 0); + + public string GetText(int i) => string.Join(" ", NSegments[i]); + + public IReadOnlyCollection AlignedWordPairs { get; set; } + + public NParallelTextRow Invert() + { + return new NParallelTextRow(TextId, NRefs.Reverse()) { NFlags = NFlags.Reverse().ToImmutableArray(), }; + } + } +} diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index 2f8a48847..8e79b5450 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -1,9 +1,4 @@ -using System; -using System.Collections; -using System.Collections.Generic; -using System.Linq; -using SIL.ObjectModel; -using SIL.Scripture; +using System.Collections.Generic; namespace SIL.Machine.Corpora { @@ -19,7 +14,11 @@ public ParallelTextCorpus( SourceCorpus = sourceCorpus; TargetCorpus = targetCorpus; AlignmentCorpus = alignmentCorpus ?? new DictionaryAlignmentCorpus(); - RowRefComparer = rowRefComparer ?? new DefaultRowRefComparer(); + RowRefComparer = rowRefComparer ?? new NParallelTextCorpus.DefaultRowRefComparer(); + NParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }) + { + AlignmentCorpus = AlignmentCorpus + }; } public override bool IsSourceTokenized => SourceCorpus.IsTokenized; @@ -30,621 +29,29 @@ public ParallelTextCorpus( public ITextCorpus SourceCorpus { get; } public ITextCorpus TargetCorpus { get; } + + public NParallelTextCorpus NParallelTextCorpus { get; set; } public IAlignmentCorpus AlignmentCorpus { get; } public IComparer RowRefComparer { get; } public override IEnumerable GetRows(IEnumerable textIds) { - IEnumerable sourceTextIds = SourceCorpus.Texts.Select(t => t.Id); - IEnumerable targetTextIds = TargetCorpus.Texts.Select(t => t.Id); - - HashSet filterTextIds; - if (AllSourceRows && AllTargetRows) - { - filterTextIds = new HashSet(sourceTextIds); - filterTextIds.UnionWith(targetTextIds); - } - else if (!AllSourceRows && !AllTargetRows) - { - filterTextIds = new HashSet(sourceTextIds); - filterTextIds.IntersectWith(targetTextIds); - } - else if (AllSourceRows) - { - filterTextIds = new HashSet(sourceTextIds); - } - else + NParallelTextCorpus.AllRowsList = new bool[] { AllSourceRows, AllTargetRows }; + bool isScripture = SourceCorpus.IsScripture() && TargetCorpus.IsScripture(); + foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) { - filterTextIds = new HashSet(targetTextIds); - } - - if (textIds != null) - filterTextIds.IntersectWith(textIds); - - using (IEnumerator srcEnumerator = SourceCorpus.GetRows(filterTextIds).GetEnumerator()) - using ( - var trgEnumerator = new TargetCorpusEnumerator( - TargetCorpus.GetRows(filterTextIds).GetEnumerator(), - SourceCorpus.Versification, - TargetCorpus.Versification + yield return new ParallelTextRow( + nRow.TextId, + nRow.NRefs[0].Count > 0 || !isScripture ? nRow.NRefs[0] : new object[] { nRow.Ref }, + nRow.NRefs[1].Count > 0 || !isScripture ? nRow.NRefs[1] : new object[] { nRow.Ref } ) - ) - using ( - IEnumerator alignmentEnumerator = AlignmentCorpus.GetRows(filterTextIds).GetEnumerator() - ) - { - var rangeInfo = new RangeInfo { TargetVersification = TargetCorpus.Versification }; - var sourceSameRefRows = new List(); - var targetSameRefRows = new List(); - - bool srcCompleted = !srcEnumerator.MoveNext(); - bool trgCompleted = !trgEnumerator.MoveNext(); - while (!srcCompleted && !trgCompleted) - { - int compare1 = 0; - try - { - compare1 = RowRefComparer.Compare(srcEnumerator.Current.Ref, trgEnumerator.Current.Ref); - } - catch (ArgumentException) - { - throw new CorpusAlignmentException( - srcEnumerator.Current.Ref.ToString(), - trgEnumerator.Current.Ref.ToString() - ); - } - if (compare1 < 0) - { - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateSourceRows( - rangeInfo, - srcEnumerator.Current, - targetSameRefRows, - forceTargetInRange: srcEnumerator.Current.TextId == trgEnumerator.Current.TextId - && !trgEnumerator.Current.IsRangeStart - && trgEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - } - else if (compare1 > 0) - { - if (!AllSourceRows && trgEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && srcEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateTargetRows( - rangeInfo, - trgEnumerator.Current, - sourceSameRefRows, - forceSourceInRange: trgEnumerator.Current.TextId == srcEnumerator.Current.TextId - && !srcEnumerator.Current.IsRangeStart - && srcEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); - } - else - { - int compare2; - do - { - try - { - compare2 = alignmentEnumerator.MoveNext() - ? RowRefComparer.Compare(srcEnumerator.Current.Ref, alignmentEnumerator.Current.Ref) - : 1; - } - catch (ArgumentException) - { - throw new CorpusAlignmentException( - srcEnumerator.Current.Ref.ToString(), - trgEnumerator.Current.Ref.ToString() - ); - } - } while (compare2 < 0); - - if ( - (!AllTargetRows && srcEnumerator.Current.IsInRange) - || (!AllSourceRows && trgEnumerator.Current.IsInRange) - ) - { - if ( - rangeInfo.IsInRange - && ( - ( - srcEnumerator.Current.IsInRange - && !trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - || ( - !srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - || ( - srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - && trgEnumerator.Current.Segment.Count > 0 - ) - ) - ) - { - yield return rangeInfo.CreateRow(); - } - - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) - { - foreach (TextRow prevSourceRow in sourceSameRefRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - prevSourceRow, - trgEnumerator.Current - ) - ) - { - yield return row; - } - } - } - - if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) - { - foreach (TextRow prevTargetRow in targetSameRefRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - prevTargetRow - ) - ) - { - yield return row; - } - } - } - - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - trgEnumerator.Current, - compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); - } - } - - while (!srcCompleted) - { - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateSourceRows(rangeInfo, srcEnumerator.Current, targetSameRefRows) - ) - { - yield return row; - } - } - srcCompleted = !srcEnumerator.MoveNext(); - } - - while (!trgCompleted) - { - if (!AllSourceRows && trgEnumerator.Current.IsInRange) - { - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateTargetRows(rangeInfo, trgEnumerator.Current, sourceSameRefRows) - ) - { - yield return row; - } - } - trgCompleted = !trgEnumerator.MoveNext(); - } - - if (rangeInfo.IsInRange) - yield return rangeInfo.CreateRow(); - } - } - - private IEnumerable CreateRows( - RangeInfo rangeInfo, - TextRow srcRow, - TextRow trgRow, - IReadOnlyCollection alignedWordPairs = null, - bool forceSourceInRange = false, - bool forceTargetInRange = false - ) - { - if (rangeInfo.IsInRange) - yield return rangeInfo.CreateRow(); - - string textId; - if (srcRow != null) - textId = srcRow.TextId; - else if (trgRow != null) - textId = trgRow.TextId; - else - throw new ArgumentNullException("Either a source or target must be specified."); - - object[] sourceRefs = srcRow != null ? new object[] { srcRow.Ref } : Array.Empty(); - object[] targetRefs = trgRow != null ? new object[] { trgRow.Ref } : Array.Empty(); - if (targetRefs.Length == 0 && TargetCorpus.IsScripture()) - { - targetRefs = sourceRefs - .Cast() - .Select(r => r.ChangeVersification(TargetCorpus.Versification)) - .Cast() - .ToArray(); - } - - TextRowFlags sourceFlags; - if (srcRow == null) - sourceFlags = forceSourceInRange ? TextRowFlags.InRange : TextRowFlags.None; - else - sourceFlags = srcRow.Flags; - - TextRowFlags targetFlags; - if (trgRow == null) - targetFlags = forceTargetInRange ? TextRowFlags.InRange : TextRowFlags.None; - else - targetFlags = trgRow.Flags; - - yield return new ParallelTextRow(textId, sourceRefs, targetRefs) - { - SourceSegment = srcRow != null ? srcRow.Segment : Array.Empty(), - TargetSegment = trgRow != null ? trgRow.Segment : Array.Empty(), - AlignedWordPairs = alignedWordPairs, - SourceFlags = sourceFlags, - TargetFlags = targetFlags - }; - } - - private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) - { - try - { - if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) - sameRefRows.Clear(); - } - catch (ArgumentException) - { - throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); - } - return sameRefRows.Count > 0; - } - - private IEnumerable CreateSourceRows( - RangeInfo rangeInfo, - TextRow sourceRow, - List targetSameRefRows, - bool forceTargetInRange = false - ) - { - if (CheckSameRefRows(targetSameRefRows, sourceRow)) - { - foreach (TextRow targetSameRefRow in targetSameRefRows) { - foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceRow, targetSameRefRow)) - yield return row; - } - } - else if (AllSourceRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - sourceRow, - null, - forceTargetInRange: forceTargetInRange - ) - ) - { - yield return row; - } - } - } - - private IEnumerable CreateTargetRows( - RangeInfo rangeInfo, - TextRow targetRow, - List sourceSameRefRows, - bool forceSourceInRange = false - ) - { - if (CheckSameRefRows(sourceSameRefRows, targetRow)) - { - foreach (TextRow sourceSameRefRow in sourceSameRefRows) - { - foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceSameRefRow, targetRow)) - yield return row; - } - } - else if (AllTargetRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - null, - targetRow, - forceSourceInRange: forceSourceInRange - ) - ) - { - yield return row; - } - } - } - - private class RangeInfo - { - public string TextId { get; set; } = ""; - public List SourceRefs { get; } = new List(); - public List TargetRefs { get; } = new List(); - public List SourceSegment { get; } = new List(); - public List TargetSegment { get; } = new List(); - public bool IsSourceSentenceStart { get; set; } = false; - public bool IsTargetSentenceStart { get; set; } = false; - public bool IsInRange => SourceRefs.Count > 0 || TargetRefs.Count > 0; - public bool IsSourceEmpty => SourceSegment.Count == 0; - public bool IsTargetEmpty => TargetSegment.Count == 0; - - public ScrVers TargetVersification { get; set; } = null; - - public ParallelTextRow CreateRow() - { - object[] trgRefs = TargetRefs.ToArray(); - if (TargetRefs.Count == 0 && TargetVersification != null) - { - trgRefs = SourceRefs - .ToArray() - .Cast() - .Select(r => r.ChangeVersification(TargetVersification)) - .Cast() - .ToArray(); - } - var row = new ParallelTextRow(TextId, SourceRefs.ToArray(), trgRefs) - { - SourceSegment = SourceSegment.ToArray(), - TargetSegment = TargetSegment.ToArray(), - SourceFlags = IsSourceSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None, - TargetFlags = IsTargetSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None + SourceFlags = nRow.NFlags[0], + TargetFlags = nRow.NFlags[1], + SourceSegment = nRow.NSegments[0], + TargetSegment = nRow.NSegments[1], + AlignedWordPairs = nRow.AlignedWordPairs }; - TextId = ""; - SourceRefs.Clear(); - TargetRefs.Clear(); - SourceSegment.Clear(); - TargetSegment.Clear(); - IsSourceSentenceStart = false; - IsTargetSentenceStart = false; - return row; - } - } - - private class DefaultRowRefComparer : IComparer - { - public int Compare(object x, object y) - { - // Do not use the default comparer for ScriptureRef, since we want to ignore segments - if (x is ScriptureRef sx && y is ScriptureRef sy) - return sx.CompareTo(sy, compareSegments: false); - - return Comparer.Default.Compare(x, y); - } - } - - private class TargetCorpusEnumerator : DisposableBase, IEnumerator - { - private readonly IEnumerator _enumerator; - private readonly bool _isScripture = false; - private readonly Queue _verseRows; - private readonly ScrVers _sourceVersification; - private TextRow _current; - private bool _isEnumerating = false; - private bool _enumeratorHasMoreData = true; - - public TargetCorpusEnumerator( - IEnumerator enumerator, - ScrVers sourceVersification, - ScrVers targetVersification - ) - { - _enumerator = enumerator; - _sourceVersification = sourceVersification; - _isScripture = - sourceVersification != null - && targetVersification != null - && sourceVersification != targetVersification; - _verseRows = new Queue(); - } - - public TextRow Current => _current; - - object IEnumerator.Current => Current; - - public bool MoveNext() - { - if (_isScripture) - { - if (!_isEnumerating) - { - _enumerator.MoveNext(); - _isEnumerating = true; - } - if (_verseRows.Count == 0 && _enumerator.Current != null && _enumeratorHasMoreData) - CollectVerses(); - if (_verseRows.Count > 0) - { - _current = _verseRows.Dequeue(); - return true; - } - _current = null; - return false; - } - - _enumeratorHasMoreData = _enumerator.MoveNext(); - _current = _enumerator.Current; - return _enumeratorHasMoreData; - } - - public void Reset() - { - _enumerator.Reset(); - _isEnumerating = false; - _enumeratorHasMoreData = true; - } - - protected override void DisposeManagedResources() - { - _enumerator.Dispose(); - } - - private void CollectVerses() - { - var rowList = new List<(ScriptureRef Ref, TextRow Row)>(); - bool outOfOrder = false; - ScriptureRef prevScrRef = ScriptureRef.Empty; - int rangeStartOffset = -1; - do - { - TextRow row = _enumerator.Current; - var scrRef = (ScriptureRef)row.Ref; - if (!prevScrRef.IsEmpty && scrRef.BookNum != prevScrRef.BookNum) - break; - - scrRef = scrRef.ChangeVersification(_sourceVersification); - // convert one-to-many versification mapping to a verse range - if (scrRef.Equals(prevScrRef)) - { - (ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[ - rowList.Count + rangeStartOffset - ]; - TextRowFlags flags = TextRowFlags.InRange; - if (rangeStartRow.IsSentenceStart) - flags |= TextRowFlags.SentenceStart; - if (rangeStartOffset == -1 && (!rangeStartRow.IsInRange || rangeStartRow.IsRangeStart)) - flags |= TextRowFlags.RangeStart; - rowList[rowList.Count + rangeStartOffset] = ( - rangeStartVerseRef, - new TextRow(rangeStartRow.TextId, rangeStartRow.Ref) - { - Segment = rangeStartRow.Segment.Concat(row.Segment).ToArray(), - Flags = flags - } - ); - row = new TextRow(row.TextId, row.Ref) { Flags = TextRowFlags.InRange }; - rangeStartOffset--; - } - else - { - rangeStartOffset = -1; - } - rowList.Add((scrRef, row)); - if (!outOfOrder && scrRef.CompareTo(prevScrRef) < 0) - outOfOrder = true; - prevScrRef = scrRef; - _enumeratorHasMoreData = _enumerator.MoveNext(); - } while (_enumeratorHasMoreData); - - if (outOfOrder) - rowList.Sort((x, y) => x.Ref.CompareTo(y.Ref)); - - foreach ((ScriptureRef _, TextRow row) in rowList) - _verseRows.Enqueue(row); } } } diff --git a/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs new file mode 100644 index 000000000..592bfcc61 --- /dev/null +++ b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs @@ -0,0 +1,122 @@ +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using SIL.ObjectModel; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public class TextCorpusEnumerator : DisposableBase, IEnumerator + { + private readonly IEnumerator _enumerator; + private readonly bool _isScripture = false; + private readonly Queue _verseRows; + private readonly ScrVers _refVersification; + private TextRow _current; + private bool _isEnumerating = false; + private bool _enumeratorHasMoreData = true; + + public TextCorpusEnumerator(IEnumerator enumerator, ScrVers refVersification, ScrVers versification) + { + _enumerator = enumerator; + _refVersification = refVersification; + _isScripture = refVersification != null && versification != null && refVersification != versification; + _verseRows = new Queue(); + } + + public TextRow Current => _current; + + object IEnumerator.Current => Current; + + public bool MoveNext() + { + if (_isScripture) + { + if (!_isEnumerating) + { + _enumerator.MoveNext(); + _isEnumerating = true; + } + if (_verseRows.Count == 0 && _enumerator.Current != null && _enumeratorHasMoreData) + CollectVerses(); + if (_verseRows.Count > 0) + { + _current = _verseRows.Dequeue(); + return true; + } + _current = null; + return false; + } + + _enumeratorHasMoreData = _enumerator.MoveNext(); + _current = _enumerator.Current; + return _enumeratorHasMoreData; + } + + public void Reset() + { + _enumerator.Reset(); + _isEnumerating = false; + _enumeratorHasMoreData = true; + } + + protected override void DisposeManagedResources() + { + _enumerator.Dispose(); + } + + private void CollectVerses() + { + var rowList = new List<(ScriptureRef Ref, TextRow Row)>(); + bool outOfOrder = false; + ScriptureRef prevRefRef = ScriptureRef.Empty; + int rangeStartOffset = -1; + do + { + TextRow row = _enumerator.Current; + var refRef = (ScriptureRef)row.Ref; + if (!prevRefRef.IsEmpty && refRef.BookNum != prevRefRef.BookNum) + break; + + refRef = refRef.ChangeVersification(_refVersification); + // convert one-to-many versification mapping to a verse range + if (refRef.Equals(prevRefRef)) + { + (ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[ + rowList.Count + rangeStartOffset + ]; + TextRowFlags flags = TextRowFlags.InRange; + if (rangeStartRow.IsSentenceStart) + flags |= TextRowFlags.SentenceStart; + if (rangeStartOffset == -1 && (!rangeStartRow.IsInRange || rangeStartRow.IsRangeStart)) + flags |= TextRowFlags.RangeStart; + rowList[rowList.Count + rangeStartOffset] = ( + rangeStartVerseRef, + new TextRow(rangeStartRow.TextId, rangeStartRow.Ref) + { + Segment = rangeStartRow.Segment.Concat(row.Segment).ToArray(), + Flags = flags + } + ); + row = new TextRow(row.TextId, row.Ref) { Flags = TextRowFlags.InRange }; + rangeStartOffset--; + } + else + { + rangeStartOffset = -1; + } + rowList.Add((refRef, row)); + if (!outOfOrder && refRef.CompareTo(prevRefRef) < 0) + outOfOrder = true; + prevRefRef = refRef; + _enumeratorHasMoreData = _enumerator.MoveNext(); + } while (_enumeratorHasMoreData); + + if (outOfOrder) + rowList.Sort((x, y) => x.Ref.CompareTo(y.Ref)); + + foreach ((ScriptureRef _, TextRow row) in rowList) + _verseRows.Enqueue(row); + } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs index 29b645b9a..5f397bc37 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs @@ -1,4 +1,5 @@ -using NUnit.Framework; +using System.Text.Json; +using NUnit.Framework; using SIL.Scripture; namespace SIL.Machine.Corpora; @@ -64,4 +65,237 @@ public void ExtractScripture() Assert.That(origRef, Is.EqualTo(new VerseRef("MAT 2:12", ScrVers.Original))); Assert.That(corpusRef, Is.EqualTo(new VerseRef("MAT 2:12", corpus.Versification))); } + + [Test] + public void MergedCorpus_SelectFirst() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 1, "source 1 segment 1 ."), TextRow("text1", 3) }) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var mergedCorpus = nParallelCorpus.SelectFirst(); + var rows = mergedCorpus.ToArray(); + Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[1].Text, Is.EqualTo("source 2 segment 2 .")); + Assert.That(rows[2].Text, Is.EqualTo("source 3 segment 3 .")); + } + + [Test] + public void MergedCorpus_SelectRandom_Seed123456() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 1 segment 1 ."), + TextRow("text1", 2, "source 1 segment 2 ."), + TextRow("text1", 3, "source 1 segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3, "source 2 segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var mergedCorpus = nParallelCorpus.SelectRandom(123456); + var rows = mergedCorpus.ToArray(); + Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.Multiple(() => + { + Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[1].Text, Is.EqualTo("source 1 segment 2 .")); + Assert.That(rows[2].Text, Is.EqualTo("source 1 segment 3 .")); + }); + } + + [Test] + public void MergedCorpus_SelectRandom_Seed4501() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 1 segment 1 ."), + TextRow("text1", 2, "source 1 segment 2 ."), + TextRow("text1", 3, "source 1 segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3, "source 2 segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var mergedCorpus = nParallelCorpus.SelectRandom(4501); + var rows = mergedCorpus.ToArray(); + Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.Multiple(() => + { + Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[1].Text, Is.EqualTo("source 2 segment 2 .")); + Assert.That(rows[2].Text, Is.EqualTo("source 3 segment 3 .")); + }); + } + + [Test] + public void AlignMergedCorpora() + { + var sourceCorpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 1 segment 1 ."), + TextRow("text1", 2, "source 1 segment 2 ."), + TextRow("text1", 3, "source 1 segment 3 .") + } + ) + ); + var sourceCorpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3, "source 2 segment 3 .") + } + ) + ); + var sourceCorpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + + ITextCorpus sourceCorpus = (new ITextCorpus[] { sourceCorpus1, sourceCorpus1, sourceCorpus3 }) + .AlignMany([true, true, true]) + .SelectFirst(); + + var targetCorpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "target 1 segment 1 ."), + TextRow("text1", 2, "target 1 segment 2 ."), + TextRow("text1", 3, "target 1 segment 3 .") + } + ) + ); + var targetCorpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "target 2 segment 1 ."), + TextRow("text1", 2, "target 2 segment 2 ."), + TextRow("text1", 3, "target 2 segment 3 .") + } + ) + ); + var targetCorpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "target 3 segment 1 ."), + TextRow("text1", 2, "target 3 segment 2 ."), + TextRow("text1", 3, "target 3 segment 3 .") + } + ) + ); + + ITextCorpus targetCorpus = (new ITextCorpus[] { targetCorpus1, targetCorpus2, targetCorpus3 }) + .AlignMany([true, true, true]) + .SelectFirst(); + + IParallelTextCorpus alignedCorpus = sourceCorpus.AlignRows(targetCorpus); + ParallelTextRow[] rows = alignedCorpus.GetRows().ToArray(); + Assert.That(rows, Has.Length.EqualTo(3)); + Assert.That(rows[0].SourceText, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[2].TargetText, Is.EqualTo("target 1 segment 3 .")); + } + + private static TextRow TextRow( + string textId, + object rowRef, + string text = "", + TextRowFlags flags = TextRowFlags.SentenceStart + ) + { + return new TextRow(textId, rowRef) + { + Segment = text.Length == 0 ? Array.Empty() : text.Split(), + Flags = flags + }; + } } diff --git a/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs new file mode 100644 index 000000000..ee3a9150f --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs @@ -0,0 +1,585 @@ +using System.Text.Json; +using NUnit.Framework; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class NParallelTextCorpusTests +{ + [Test] + public void GetRows_ThreeCorpora() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[0].NRefs.All(r => (int)r[0] == 1)); + Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 1 .".Split()))); + Assert.That(rows[0].GetIsSentenceStart(0), Is.False); + Assert.That(rows[0].GetIsSentenceStart(1), Is.True); + Assert.That(rows[2].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[2].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[2].GetIsSentenceStart(1), Is.False); + Assert.That(rows[2].GetIsSentenceStart(2), Is.True); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(1)); + Assert.That(rows[0].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[0].GetIsSentenceStart(0), Is.True); + Assert.That(rows[0].GetIsSentenceStart(1), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_AllAllRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[2].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[2].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[2].GetIsSentenceStart(0), Is.True); + Assert.That(rows[2].GetIsSentenceStart(1), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_SomeAllRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [true, false, true] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2)); + Assert.That(rows[1].NRefs.All(r => (int)r[0] == 3)); + Assert.That(rows[1].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); + Assert.That(rows[1].GetIsSentenceStart(0), Is.True); + Assert.That(rows[1].GetIsSentenceStart(1), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_AllAllRows_MissingMiddle() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .", TextRowFlags.None) + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[1].NRefs.All(r => r.Count == 0 || (int)r[0] == 2)); + Assert.That(rows[1].NSegments.All(r => r.Count == 0 || r.SequenceEqual("source segment 2 .".Split()))); + Assert.That(rows[1].GetIsSentenceStart(1), Is.True); + } + + [Test] + public void GetRows_ThreeCorpora_MissingRows_MissingLastRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 1, "source segment 1 ."), }) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText("text1", new[] { TextRow("text1", 1, "source segment 1 .") }) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [true, false, false] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[1].NRefs.All(r => r.Count == 0 || (int)r[0] == 2)); + Assert.That(rows[1].NSegments.All(r => r.Count == 0 || r.SequenceEqual("source segment 2 .".Split()))); + Assert.That(rows[1].GetIsSentenceStart(0), Is.True); + } + + [Test] + public void GetRows_OneCorpus() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 .", TextRowFlags.None), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1]) { AllRowsList = [true] }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2)); + Assert.That(rows[0].NRefs.All(r => (int)r[0] == 1)); + Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 1 .".Split()))); + Assert.That(rows[0].GetIsSentenceStart(0), Is.False); + } + + [Test] + public void GetRows_ThreeCorpora_Range() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange), + TextRow("text1", 4, "source segment 4 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 ."), + TextRow("text1", 4, "source segment 4 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 ."), + TextRow("text1", 4, "source segment 4 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3)); + Assert.That(rows[1].NRefs.All(r => r.SequenceEqual([2, 3]))); + Assert.That(rows[1].NSegments[0], Is.EqualTo("source segment 2 . source segment 3 .".Split())); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(1), JsonSerializer.Serialize(rows)); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges_AllIndividualRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [false, false, true] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(3), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1 })); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges_AllRangeOneThroughTwoRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [false, true, false] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1, 2 })); + } + + [Test] + public void GetRows_ThreeCorpora_OverlappingRanges_AllRangeTwoThroughThreeRows() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow( + "text1", + 2, + "source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) + { + AllRowsList = [true, false, false] + }; + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(2), JsonSerializer.Serialize(rows)); + Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1 })); + } + + [Test] + public void GetRows_ThreeCorpora_SameRefManyToMany() + { + var corpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2-1 ."), + TextRow("text1", 2, "source segment 2-2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2-1 ."), + TextRow("text1", 2, "source segment 2-2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var corpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source segment 1 ."), + TextRow("text1", 2, "source segment 2-1 ."), + TextRow("text1", 2, "source segment 2-2 ."), + TextRow("text1", 3, "source segment 3 .") + } + ) + ); + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]); + NParallelTextRow[] rows = nParallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(10)); + } + + private static TextRow TextRow( + string textId, + object rowRef, + string text = "", + TextRowFlags flags = TextRowFlags.SentenceStart + ) + { + return new TextRow(textId, rowRef) + { + Segment = text.Length == 0 ? Array.Empty() : text.Split(), + Flags = flags + }; + } +} From 5a484d0b4a0d4a15bd3545895f1a992b36d3c7c0 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 8 Nov 2024 16:48:01 -0500 Subject: [PATCH 14/26] Change naming to avoid confusion with 'Select' --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 4 ++-- .../Corpora/CorporaExtensionsTests.cs | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index a737d8c13..a9e2b4f96 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -377,12 +377,12 @@ public static ITextCorpus FilterTexts(this ITextCorpus corpus, IEnumerable @@ -188,7 +188,7 @@ public void MergedCorpus_SelectRandom_Seed4501() ) ); var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; - var mergedCorpus = nParallelCorpus.SelectRandom(4501); + var mergedCorpus = nParallelCorpus.ChooseRandom(4501); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); Assert.Multiple(() => @@ -238,7 +238,7 @@ public void AlignMergedCorpora() ITextCorpus sourceCorpus = (new ITextCorpus[] { sourceCorpus1, sourceCorpus1, sourceCorpus3 }) .AlignMany([true, true, true]) - .SelectFirst(); + .ChooseFirst(); var targetCorpus1 = new DictionaryTextCorpus( new MemoryText( @@ -276,7 +276,7 @@ public void AlignMergedCorpora() ITextCorpus targetCorpus = (new ITextCorpus[] { targetCorpus1, targetCorpus2, targetCorpus3 }) .AlignMany([true, true, true]) - .SelectFirst(); + .ChooseFirst(); IParallelTextCorpus alignedCorpus = sourceCorpus.AlignRows(targetCorpus); ParallelTextRow[] rows = alignedCorpus.GetRows().ToArray(); From e07cf64af0d429ad2c9d582f6efc7e48e8cce0b9 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 13 Nov 2024 14:17:35 -0500 Subject: [PATCH 15/26] Reviewer-requested changes --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 73 ++++---- .../Corpora/INParallelTextCorpus.cs | 4 + .../Corpora/NParallelTextCorpus.cs | 169 +++++++++--------- .../Corpora/NParallelTextCorpusBase.cs | 6 + src/SIL.Machine/Corpora/NParallelTextRow.cs | 8 +- src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 6 +- .../Corpora/TextCorpusEnumerator.cs | 2 +- .../Corpora/CorporaExtensionsTests.cs | 6 +- .../Corpora/NParallelTextCorpusTests.cs | 57 +++--- 9 files changed, 160 insertions(+), 171 deletions(-) diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index a9e2b4f96..38ad693c7 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -245,16 +245,6 @@ public static IParallelTextCorpus AlignRows( }; } - public static NParallelTextCorpus AlignMany(this ITextCorpus[] corpora, bool[] allRowsPerCorpus = null) - { - NParallelTextCorpus nParallelTextCorpus = new NParallelTextCorpus(corpora); - if (allRowsPerCorpus != null) - { - nParallelTextCorpus.AllRowsList = allRowsPerCorpus; - } - return nParallelTextCorpus; - } - public static (ITextCorpus, ITextCorpus, int, int) Split( this ITextCorpus corpus, double? percent = null, @@ -377,16 +367,6 @@ public static ITextCorpus FilterTexts(this ITextCorpus corpus, IEnumerable GetRows(IEnumerable textIds) } } + #endregion + + #region INParallelTextCorpus operations + + public static INParallelTextCorpus AlignMany( + this IEnumerable corpora, + IEnumerable allRowsPerCorpus = null + ) + { + NParallelTextCorpus nParallelTextCorpus = new NParallelTextCorpus(corpora); + if (allRowsPerCorpus != null) + { + nParallelTextCorpus.AllRows = allRowsPerCorpus.ToArray(); + } + return nParallelTextCorpus; + } + + public static ITextCorpus ChooseRandom(this INParallelTextCorpus corpus, int seed) + { + return new MergedCorpus(corpus, MergeRule.Random, seed); + } + + public static ITextCorpus ChooseFirst(this INParallelTextCorpus corpus) + { + return new MergedCorpus(corpus, MergeRule.First, 0); + } + private enum MergeRule { - First = 1, - Random = 2 + First, + Random } private class MergedCorpus : TextCorpusBase { - private readonly NParallelTextCorpus _corpus; + private readonly INParallelTextCorpus _corpus; private readonly MergeRule _mergeRule; private readonly Random _random; - private readonly int _seed; - - public MergedCorpus(NParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule, int seed) + public MergedCorpus(INParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule, int seed) { _corpus = nParallelTextCorpus; _mergeRule = mergeRule; - _seed = seed; - _random = new Random(_seed); + _random = new Random(seed); } public override IEnumerable Texts => _corpus.Corpora.SelectMany(c => c.Texts); - public override bool IsTokenized => - Enumerable.Range(0, _corpus.N).Select(i => _corpus.GetIsTokenized(i)).All(b => b); + public override bool IsTokenized => Enumerable.Range(0, _corpus.N).All(i => _corpus.IsTokenized(i)); - public override ScrVers Versification => _corpus.N > 0 ? _corpus.Corpora.First().Versification : null; + public override ScrVers Versification => _corpus.N > 0 ? _corpus.Corpora[0].Versification : null; public override IEnumerable GetRows(IEnumerable textIds) { @@ -579,14 +582,14 @@ public override IEnumerable GetRows(IEnumerable textIds) { IReadOnlyList nonEmptyIndices = nRow .NSegments.Select((s, i) => (s, i)) - .Where(pair => pair.s.Count > 0 || nRow.GetIsInRange(pair.i)) + .Where(pair => pair.s.Count > 0 || nRow.IsInRange(pair.i)) .Select(pair => pair.i) .ToList(); IReadOnlyList indices = nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList(); if (indexOfInRangeRow == -1) { - indices = indices.Where(i => nRow.GetIsRangeStart(i) || !nRow.GetIsInRange(i)).ToList(); + indices = indices.Where(i => nRow.IsRangeStart(i) || !nRow.IsInRange(i)).ToList(); } if (indices.Count == 0) continue; @@ -601,11 +604,11 @@ public override IEnumerable GetRows(IEnumerable textIds) break; } indexOfSelectedRow = indexOfInRangeRow != -1 ? indexOfInRangeRow : indexOfSelectedRow; - if (!nRow.GetIsInRange(indexOfSelectedRow)) + if (!nRow.IsInRange(indexOfSelectedRow)) { indexOfInRangeRow = -1; } - if (nRow.GetIsRangeStart(indexOfSelectedRow)) + if (nRow.IsRangeStart(indexOfSelectedRow)) { indexOfInRangeRow = indexOfSelectedRow; } diff --git a/src/SIL.Machine/Corpora/INParallelTextCorpus.cs b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs index 5a1e86f76..0dfde2fa3 100644 --- a/src/SIL.Machine/Corpora/INParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs @@ -4,6 +4,10 @@ namespace SIL.Machine.Corpora { public interface INParallelTextCorpus : ICorpus { + int N { get; } + IReadOnlyList Corpora { get; } + + bool IsTokenized(int i); int Count(bool includeEmpty = true, IEnumerable textIds = null); IEnumerable GetRows(IEnumerable textIds); diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index b9da97976..f5dfd1191 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -3,7 +3,6 @@ using System.Collections.Immutable; using System.Linq; using SIL.Extensions; -using SIL.Linq; using SIL.Scripture; namespace SIL.Machine.Corpora @@ -16,62 +15,51 @@ public NParallelTextCorpus(IEnumerable corpora, IComparer r if (Corpora.Count < 1) throw new ArgumentException("There must be at least one corpora.", nameof(corpora)); RowRefComparer = rowRefComparer ?? new DefaultRowRefComparer(); - AllRowsList = new bool[Corpora.Count] + AllRows = new bool[Corpora.Count] .Select(_ => false) .ToImmutableArray(); } - public bool GetIsTokenized(int i) => + public override bool IsTokenized(int i) => i < Corpora.Count ? Corpora[i].IsTokenized : throw new ArgumentOutOfRangeException(nameof(i)); - public int N => Corpora.Count; - - public IReadOnlyList AllRowsList { get; set; } - public IReadOnlyList Corpora { get; } + public override int N => Corpora.Count; + public IReadOnlyList AllRows { get; set; } + public override IReadOnlyList Corpora { get; } public IAlignmentCorpus AlignmentCorpus { get; set; } public IComparer RowRefComparer { get; } - private static HashSet GetTextIdsFromCorpora( - IEnumerable corpora, - IEnumerable allRowsEnumerate - ) + private HashSet GetTextIdsFromCorpora() { - IReadOnlyList> textIdListOfLists = corpora - .Select(c => c.Texts.Select(t => t.Id)) - .ToImmutableArray(); - - HashSet textIds = textIdListOfLists - .Skip(1) - .Aggregate( - new HashSet(textIdListOfLists.First()), - (h, e) => - { - h.IntersectWith(e); - return h; - } - ); - allRowsEnumerate - .Select((allRows, i) => (allRows, i)) - .Where(t => t.allRows) - .ForEach(t => textIds.UnionWith(textIdListOfLists[t.i])); + HashSet textIds = new HashSet(); + HashSet allRowsTextIds = new HashSet(); + for (int i = 0; i < Corpora.Count; i++) + { + if (i == 0) + textIds.AddRange(Corpora[i].Texts.Select(t => t.Id)); + else + textIds.IntersectWith(Corpora[i].Texts.Select(t => t.Id)); + if (AllRows[i]) + allRowsTextIds.AddRange(Corpora[i].Texts.Select(t => t.Id)); + } + textIds.UnionWith(allRowsTextIds); return textIds; } public override IEnumerable GetRows(IEnumerable textIds) { - HashSet filterTextIds = GetTextIdsFromCorpora(Corpora, AllRowsList); + HashSet filterTextIds = GetTextIdsFromCorpora(); if (textIds != null) filterTextIds.IntersectWith(textIds); IEnumerator alignmentEnumerator = null; - IList> enumeratedCorpora = new List>(); - IEnumerable rows = new List() { }; + List> enumeratedCorpora = new List>(); try { for (int i = 0; i < Corpora.Count; i++) { - var enumerator = Corpora[i].GetRows(filterTextIds).GetEnumerator(); + IEnumerator enumerator = Corpora[i].GetRows(filterTextIds).GetEnumerator(); enumeratedCorpora.Add( new TextCorpusEnumerator(enumerator, Corpora[0].Versification, Corpora[i].Versification) ); @@ -79,7 +67,8 @@ public override IEnumerable GetRows(IEnumerable textId if (AlignmentCorpus != null) alignmentEnumerator = AlignmentCorpus.GetRows(filterTextIds).GetEnumerator(); - rows = GetRows(enumeratedCorpora, alignmentEnumerator).ToList(); + foreach (NParallelTextRow row in GetRows(enumeratedCorpora, alignmentEnumerator)) + yield return row; } finally { @@ -89,10 +78,9 @@ public override IEnumerable GetRows(IEnumerable textId } alignmentEnumerator?.Dispose(); } - return rows; } - private bool AllInRangeHaveSegments(IList rows) + private static bool AllInRangeHaveSegments(IList rows) { return rows.All(r => (r.IsInRange && r.Segment.Count > 0) || (!r.IsInRange)); } @@ -100,7 +88,7 @@ private bool AllInRangeHaveSegments(IList rows) private IList MinRefIndexes(IList refs) { object minRef = refs[0]; - IList minRefIndexes = new List() { 0 }; + List minRefIndexes = new List() { 0 }; for (int i = 1; i < refs.Count; i++) { if (RowRefComparer.Compare(refs[i], minRef) < 0) @@ -118,7 +106,7 @@ private IList MinRefIndexes(IList refs) } private IEnumerable GetRows( - IList> listOfEnumerators, + IList> enumerators, IEnumerator alignmentEnumerator ) { @@ -129,63 +117,65 @@ IEnumerator alignmentEnumerator RowRefComparer = RowRefComparer }; - bool[] completed = listOfEnumerators.Select(e => !e.MoveNext()).ToArray(); + bool[] completed = enumerators.Select(e => !e.MoveNext()).ToArray(); while (!completed.All(c => c)) { - IList minRefIndexes; - IList currentRows = listOfEnumerators.Select(e => e.Current).ToArray(); + List minRefIndexes; + List currentRows = enumerators.Select(e => e.Current).ToList(); try { minRefIndexes = MinRefIndexes( - currentRows - .Select(e => - { - if (e != null) - return e.Ref; - return null; - }) - .ToArray() - ); + currentRows + .Select( + (e, i) => + { + if (!completed[i]) + return e.Ref; + return null; + } + ) + .ToArray() + ) + .ToList(); } catch (ArgumentException) { throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); } - var currentIncompleteRows = currentRows.Where((r, i) => !completed[i]).ToArray(); - IList nonMinRefIndexes = System.Linq.Enumerable.Range(0, N).Except(minRefIndexes).ToList(); - - if ( - minRefIndexes.Count < (N - completed.Count(c => c)) - || completed.Where((c, i) => !c && minRefIndexes.Contains(i)).Count() == 1 - ) //then there are some non-min refs or only one incomplete enumerator + TextRow[] currentIncompleteRows = currentRows.Where((r, i) => !completed[i]).ToArray(); + List nonMinRefIndexes = Enumerable.Range(0, N).Except(minRefIndexes).ToList(); + int numberOfRemainingRows = N - completed.Count(c => c); + if (minRefIndexes.Count < numberOfRemainingRows || minRefIndexes.Count(i => !completed[i]) == 1) + //then there are some non-min refs or only one incomplete enumerator { - IList> minEnumerators = minRefIndexes - .Select(i => listOfEnumerators[i]) - .ToList(); - IList> nonMinEnumerators = nonMinRefIndexes - .Select(i => listOfEnumerators[i]) + List> minEnumerators = minRefIndexes.Select(i => enumerators[i]).ToList(); + List> nonMinEnumerators = nonMinRefIndexes + .Select(i => enumerators[i]) .ToList(); if ( - nonMinRefIndexes.Any(i => !AllRowsList[i]) - && minRefIndexes.Where(i => !completed[i] && listOfEnumerators[i].Current.IsInRange).Any() + nonMinRefIndexes.Any(i => !AllRows[i]) + && minRefIndexes.Any(i => !completed[i] && currentRows[i].IsInRange) ) { if ( rangeInfo.IsInRange - && nonMinEnumerators - .Where(e => e.Current != null && e.Current.IsInRange && e.Current.Segment.Count > 0) - .Any() + && nonMinEnumerators.Any(e => + e.Current != null && e.Current.IsInRange && e.Current.Segment.Count > 0 + ) ) { yield return rangeInfo.CreateRow(); } - minRefIndexes.ForEach(i => rangeInfo.AddTextRow(listOfEnumerators[i].Current, i)); + minRefIndexes.ForEach(i => rangeInfo.AddTextRow(enumerators[i].Current, i)); nonMinRefIndexes.ForEach(i => rangeInfo.Rows[i].SameRefRows.Clear()); } else { + bool anyNonMinEnumeratorsMidRange = nonMinRefIndexes.Any(i => + !completed[i] && !currentRows[i].IsRangeStart && currentRows[i].IsInRange + ); foreach ( NParallelTextRow row in CreateMinRefRows( rangeInfo, @@ -194,13 +184,10 @@ NParallelTextRow row in CreateMinRefRows( nonMinRefIndexes.ToArray(), forceInRange: minRefIndexes .Select(i => - nonMinEnumerators.All(e => - e.Current != null && e.Current.TextId == currentRows[i].TextId + anyNonMinEnumeratorsMidRange + && nonMinRefIndexes.All(j => + !completed[j] && currentRows[j].TextId == currentRows[i].TextId ) - && nonMinEnumerators - .Where(e => e.Current != null) - .Select(e => !e.Current.IsRangeStart && e.Current.IsInRange) - .Any(b => b) ) .ToList() ) @@ -211,11 +198,11 @@ NParallelTextRow row in CreateMinRefRows( } foreach (int i in minRefIndexes) { - rangeInfo.Rows[i].SameRefRows.Add(listOfEnumerators[i].Current); - completed[i] = !listOfEnumerators[i].MoveNext(); + rangeInfo.Rows[i].SameRefRows.Add(enumerators[i].Current); + completed[i] = !enumerators[i].MoveNext(); } } - else if (minRefIndexes.Count == (N - completed.Count(c => c))) + else if (minRefIndexes.Count == numberOfRemainingRows) // the refs are all the same { int compareAlignmentCorpus = -1; @@ -244,8 +231,7 @@ NParallelTextRow row in CreateMinRefRows( if ( minRefIndexes .Select(i => - listOfEnumerators[i].Current.IsInRange - && minRefIndexes.All(j => j == i || !AllRowsList[j]) + enumerators[i].Current.IsInRange && minRefIndexes.All(j => j == i || !AllRows[j]) ) .Any(b => b) ) @@ -265,7 +251,7 @@ NParallelTextRow row in CreateMinRefRows( { for (int i = 0; i < rangeInfo.Rows.Count; i++) { - for (int j = 0; j < rangeInfo.Rows.Count; j++) + for (int j = 0; j < rangeInfo.Rows.Count; j++) //TODO rework { if (i == j || completed[i] || completed[j]) continue; @@ -302,7 +288,7 @@ NParallelTextRow row in CreateRows( for (int i = 0; i < rangeInfo.Rows.Count; i++) { rangeInfo.Rows[i].SameRefRows.Add(currentRows[i]); - completed[i] = !listOfEnumerators[i].MoveNext(); + completed[i] = !enumerators[i].MoveNext(); } } else @@ -382,17 +368,17 @@ private IEnumerable CreateMinRefRows( IReadOnlyList forceInRange = null ) { - List<(IList Rows, int Index)> sameRefRowsPerIndex = nonMinRefIndexes + List<(List Rows, int Index)> sameRefRowsPerIndex = nonMinRefIndexes .Select(i => (rangeInfo.Rows[i], i)) - .Select(pair => (pair.Item1.SameRefRows, pair.Item2)) + .Select(pair => (pair.Item1.SameRefRows.ToList(), pair.Item2)) .ToList(); - List alreadyYielded = new List(); + HashSet alreadyYielded = new HashSet(); TextRow[] textRows; foreach (int i in minRefIndexes) { TextRow textRow = currentRows[i]; - foreach ((IList sameRefRows, int j) in sameRefRowsPerIndex) + foreach ((List sameRefRows, int j) in sameRefRowsPerIndex) { if (i == j) continue; @@ -416,13 +402,15 @@ NParallelTextRow row in CreateRows(rangeInfo, textRows, forceInRange: forceInRan } textRows = new TextRow[N]; var forceCurrentInRange = new bool[N]; - foreach (int i in minRefIndexes.Where(i => AllRowsList[i]).Except(alreadyYielded)) + bool rowsHaveContent = false; + foreach (int i in minRefIndexes.Where(i => AllRows[i]).Except(alreadyYielded)) { TextRow textRow = currentRows[i]; textRows[i] = textRow; forceCurrentInRange[i] = forceCurrentInRange[i]; + rowsHaveContent = true; } - if (textRows.Any(tr => tr != null)) + if (rowsHaveContent) { foreach (NParallelTextRow row in CreateRows(rangeInfo, textRows, forceCurrentInRange)) { @@ -492,8 +480,11 @@ public void AddTextRow(TextRow row, int index) public NParallelTextRow CreateRow() { object[][] refs = new object[N][]; - IList referenceRefs = Rows.Where(r => r.Refs.Count > 0).Select(r => r.Refs).FirstOrDefault(); - foreach (int i in System.Linq.Enumerable.Range(0, Rows.Count)) + List referenceRefs = Rows.Where(r => r.Refs.Count > 0) + .Select(r => r.Refs) + .FirstOrDefault() + .ToList(); + foreach (int i in Enumerable.Range(0, Rows.Count)) { var row = Rows[i]; diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs b/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs index 5487b2001..73ccf56f8 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs @@ -6,6 +6,12 @@ namespace SIL.Machine.Corpora { public abstract class NParallelTextCorpusBase : INParallelTextCorpus { + public abstract int N { get; } + + public abstract IReadOnlyList Corpora { get; } + + public abstract bool IsTokenized(int i); + int ICorpus.Count(bool includeEmpty) { return Count(includeEmpty, null); diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs index fd60d8d99..e76c57d93 100644 --- a/src/SIL.Machine/Corpora/NParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -33,18 +33,18 @@ public NParallelTextRow(string textId, IEnumerable> nRefs) public IReadOnlyList> NSegments { get; set; } public IReadOnlyList NFlags { get; set; } - public bool GetIsSentenceStart(int i) => + public bool IsSentenceStart(int i) => NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.SentenceStart) : throw new ArgumentOutOfRangeException(); - public bool GetIsInRange(int i) => + public bool IsInRange(int i) => NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.InRange) : throw new ArgumentOutOfRangeException(); - public bool GetIsRangeStart(int i) => + public bool IsRangeStart(int i) => NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.RangeStart) : throw new ArgumentOutOfRangeException(); public bool IsEmpty => NSegments.All(s => s.Count == 0); - public string GetText(int i) => string.Join(" ", NSegments[i]); + public string Text(int i) => string.Join(" ", NSegments[i]); public IReadOnlyCollection AlignedWordPairs { get; set; } diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index 8e79b5450..e1b64281b 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -29,14 +29,14 @@ public ParallelTextCorpus( public ITextCorpus SourceCorpus { get; } public ITextCorpus TargetCorpus { get; } - - public NParallelTextCorpus NParallelTextCorpus { get; set; } public IAlignmentCorpus AlignmentCorpus { get; } public IComparer RowRefComparer { get; } + private NParallelTextCorpus NParallelTextCorpus { get; set; } + public override IEnumerable GetRows(IEnumerable textIds) { - NParallelTextCorpus.AllRowsList = new bool[] { AllSourceRows, AllTargetRows }; + NParallelTextCorpus.AllRows = new bool[] { AllSourceRows, AllTargetRows }; bool isScripture = SourceCorpus.IsScripture() && TargetCorpus.IsScripture(); foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) { diff --git a/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs index 592bfcc61..7653a135c 100644 --- a/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs +++ b/src/SIL.Machine/Corpora/TextCorpusEnumerator.cs @@ -6,7 +6,7 @@ namespace SIL.Machine.Corpora { - public class TextCorpusEnumerator : DisposableBase, IEnumerator + internal class TextCorpusEnumerator : DisposableBase, IEnumerator { private readonly IEnumerator _enumerator; private readonly bool _isScripture = false; diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs index 708b4ffab..2f8ec3a57 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs @@ -94,7 +94,7 @@ public void MergedCorpus_SelectFirst() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, true, true] }; var mergedCorpus = nParallelCorpus.ChooseFirst(); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); @@ -139,7 +139,7 @@ public void MergedCorpus_SelectRandom_Seed123456() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, true, true] }; var mergedCorpus = nParallelCorpus.ChooseRandom(123456); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); @@ -187,7 +187,7 @@ public void MergedCorpus_SelectRandom_Seed4501() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, true, true] }; var mergedCorpus = nParallelCorpus.ChooseRandom(4501); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); diff --git a/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs index ee3a9150f..68dc9f90c 100644 --- a/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/NParallelTextCorpusTests.cs @@ -47,12 +47,12 @@ public void GetRows_ThreeCorpora() Assert.That(rows.Length, Is.EqualTo(3)); Assert.That(rows[0].NRefs.All(r => (int)r[0] == 1)); Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 1 .".Split()))); - Assert.That(rows[0].GetIsSentenceStart(0), Is.False); - Assert.That(rows[0].GetIsSentenceStart(1), Is.True); + Assert.That(rows[0].IsSentenceStart(0), Is.False); + Assert.That(rows[0].IsSentenceStart(1), Is.True); Assert.That(rows[2].NRefs.All(r => (int)r[0] == 3)); Assert.That(rows[2].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); - Assert.That(rows[2].GetIsSentenceStart(1), Is.False); - Assert.That(rows[2].GetIsSentenceStart(2), Is.True); + Assert.That(rows[2].IsSentenceStart(1), Is.False); + Assert.That(rows[2].IsSentenceStart(2), Is.True); } [Test] @@ -86,8 +86,8 @@ public void GetRows_ThreeCorpora_MissingRows() Assert.That(rows.Length, Is.EqualTo(1)); Assert.That(rows[0].NRefs.All(r => (int)r[0] == 3)); Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); - Assert.That(rows[0].GetIsSentenceStart(0), Is.True); - Assert.That(rows[0].GetIsSentenceStart(1), Is.False); + Assert.That(rows[0].IsSentenceStart(0), Is.True); + Assert.That(rows[0].IsSentenceStart(1), Is.False); } [Test] @@ -116,13 +116,13 @@ public void GetRows_ThreeCorpora_MissingRows_AllAllRows() var corpus3 = new DictionaryTextCorpus( new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, true, true] }; NParallelTextRow[] rows = nParallelCorpus.ToArray(); Assert.That(rows.Length, Is.EqualTo(3)); Assert.That(rows[2].NRefs.All(r => (int)r[0] == 3)); Assert.That(rows[2].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); - Assert.That(rows[2].GetIsSentenceStart(0), Is.True); - Assert.That(rows[2].GetIsSentenceStart(1), Is.False); + Assert.That(rows[2].IsSentenceStart(0), Is.True); + Assert.That(rows[2].IsSentenceStart(1), Is.False); } [Test] @@ -151,16 +151,13 @@ public void GetRows_ThreeCorpora_MissingRows_SomeAllRows() var corpus3 = new DictionaryTextCorpus( new MemoryText("text1", new[] { TextRow("text1", 3, "source segment 3 .") }) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) - { - AllRowsList = [true, false, true] - }; + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, false, true] }; NParallelTextRow[] rows = nParallelCorpus.ToArray(); Assert.That(rows.Length, Is.EqualTo(2)); Assert.That(rows[1].NRefs.All(r => (int)r[0] == 3)); Assert.That(rows[1].NSegments.All(r => r.SequenceEqual("source segment 3 .".Split()))); - Assert.That(rows[1].GetIsSentenceStart(0), Is.True); - Assert.That(rows[1].GetIsSentenceStart(1), Is.False); + Assert.That(rows[1].IsSentenceStart(0), Is.True); + Assert.That(rows[1].IsSentenceStart(1), Is.False); } [Test] @@ -198,12 +195,12 @@ public void GetRows_ThreeCorpora_MissingRows_AllAllRows_MissingMiddle() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRowsList = [true, true, true] }; + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, true, true] }; NParallelTextRow[] rows = nParallelCorpus.ToArray(); Assert.That(rows.Length, Is.EqualTo(3)); Assert.That(rows[1].NRefs.All(r => r.Count == 0 || (int)r[0] == 2)); Assert.That(rows[1].NSegments.All(r => r.Count == 0 || r.SequenceEqual("source segment 2 .".Split()))); - Assert.That(rows[1].GetIsSentenceStart(1), Is.True); + Assert.That(rows[1].IsSentenceStart(1), Is.True); } [Test] @@ -226,15 +223,12 @@ public void GetRows_ThreeCorpora_MissingRows_MissingLastRows() var corpus3 = new DictionaryTextCorpus( new MemoryText("text1", new[] { TextRow("text1", 1, "source segment 1 .") }) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) - { - AllRowsList = [true, false, false] - }; + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, false, false] }; NParallelTextRow[] rows = nParallelCorpus.ToArray(); Assert.That(rows.Length, Is.EqualTo(3)); Assert.That(rows[1].NRefs.All(r => r.Count == 0 || (int)r[0] == 2)); Assert.That(rows[1].NSegments.All(r => r.Count == 0 || r.SequenceEqual("source segment 2 .".Split()))); - Assert.That(rows[1].GetIsSentenceStart(0), Is.True); + Assert.That(rows[1].IsSentenceStart(0), Is.True); } [Test] @@ -250,12 +244,12 @@ public void GetRows_OneCorpus() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1]) { AllRowsList = [true] }; + var nParallelCorpus = new NParallelTextCorpus([corpus1]) { AllRows = [true] }; NParallelTextRow[] rows = nParallelCorpus.ToArray(); Assert.That(rows.Length, Is.EqualTo(2)); Assert.That(rows[0].NRefs.All(r => (int)r[0] == 1)); Assert.That(rows[0].NSegments.All(r => r.SequenceEqual("source segment 1 .".Split()))); - Assert.That(rows[0].GetIsSentenceStart(0), Is.False); + Assert.That(rows[0].IsSentenceStart(0), Is.False); } [Test] @@ -406,10 +400,7 @@ public void GetRows_ThreeCorpora_OverlappingRanges_AllIndividualRows() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) - { - AllRowsList = [false, false, true] - }; + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [false, false, true] }; NParallelTextRow[] rows = nParallelCorpus.ToArray(); Assert.That(rows.Length, Is.EqualTo(3), JsonSerializer.Serialize(rows)); Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1 })); @@ -461,10 +452,7 @@ public void GetRows_ThreeCorpora_OverlappingRanges_AllRangeOneThroughTwoRows() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) - { - AllRowsList = [false, true, false] - }; + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [false, true, false] }; NParallelTextRow[] rows = nParallelCorpus.ToArray(); Assert.That(rows.Length, Is.EqualTo(2), JsonSerializer.Serialize(rows)); Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1, 2 })); @@ -516,10 +504,7 @@ public void GetRows_ThreeCorpora_OverlappingRanges_AllRangeTwoThroughThreeRows() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) - { - AllRowsList = [true, false, false] - }; + var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, false, false] }; NParallelTextRow[] rows = nParallelCorpus.ToArray(); Assert.That(rows.Length, Is.EqualTo(2), JsonSerializer.Serialize(rows)); Assert.That(rows[0].NRefs[0], Is.EquivalentTo(new object[] { 1 })); From 54ae315cf94f22a100315021fd652da64d52a533 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 13 Nov 2024 18:15:36 -0500 Subject: [PATCH 16/26] Reviewer changes --- .../Corpora/NParallelTextCorpus.cs | 300 +++++++++--------- src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 2 +- 2 files changed, 149 insertions(+), 153 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index f5dfd1191..bc5210c49 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -110,198 +110,194 @@ private IEnumerable GetRows( IEnumerator alignmentEnumerator ) { + var rangeInfo = new NRangeInfo(N) { - var rangeInfo = new NRangeInfo(N) - { - Versifications = Corpora.Select(c => c.Versification).ToArray(), - RowRefComparer = RowRefComparer - }; + Versifications = Corpora.Select(c => c.Versification).ToArray(), + RowRefComparer = RowRefComparer + }; - bool[] completed = enumerators.Select(e => !e.MoveNext()).ToArray(); + bool[] completed = enumerators.Select(e => !e.MoveNext()).ToArray(); - while (!completed.All(c => c)) + while (!completed.All(c => c)) + { + List minRefIndexes; + List currentRows = enumerators.Select(e => e.Current).ToList(); + try + { + minRefIndexes = MinRefIndexes( + currentRows + .Select( + (e, i) => + { + if (!completed[i]) + return e.Ref; + return null; + } + ) + .ToArray() + ) + .ToList(); + } + catch (ArgumentException) { - List minRefIndexes; - List currentRows = enumerators.Select(e => e.Current).ToList(); - try + throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); + } + TextRow[] currentIncompleteRows = currentRows.Where((r, i) => !completed[i]).ToArray(); + List nonMinRefIndexes = Enumerable.Range(0, N).Except(minRefIndexes).ToList(); + int numberOfRemainingRows = N - completed.Count(c => c); + if (minRefIndexes.Count < numberOfRemainingRows || minRefIndexes.Count(i => !completed[i]) == 1) + //then there are some non-min refs or only one incomplete enumerator + { + List> minEnumerators = minRefIndexes.Select(i => enumerators[i]).ToList(); + List> nonMinEnumerators = nonMinRefIndexes + .Select(i => enumerators[i]) + .ToList(); + + if ( + nonMinRefIndexes.Any(i => !AllRows[i]) + && minRefIndexes.Any(i => !completed[i] && currentRows[i].IsInRange) + ) { - minRefIndexes = MinRefIndexes( - currentRows - .Select( - (e, i) => - { - if (!completed[i]) - return e.Ref; - return null; - } + if ( + rangeInfo.IsInRange + && nonMinEnumerators.Any(e => + e.Current != null && e.Current.IsInRange && e.Current.Segment.Count > 0 + ) + ) + { + yield return rangeInfo.CreateRow(); + } + minRefIndexes.ForEach(i => rangeInfo.AddTextRow(enumerators[i].Current, i)); + nonMinRefIndexes.ForEach(i => rangeInfo.Rows[i].SameRefRows.Clear()); + } + else + { + bool anyNonMinEnumeratorsMidRange = nonMinRefIndexes.Any(i => + !completed[i] && !currentRows[i].IsRangeStart && currentRows[i].IsInRange + ); + foreach ( + NParallelTextRow row in CreateMinRefRows( + rangeInfo, + currentRows.ToArray(), + minRefIndexes.ToArray(), + nonMinRefIndexes.ToArray(), + forceInRange: minRefIndexes + .Select(i => + anyNonMinEnumeratorsMidRange + && nonMinRefIndexes.All(j => + !completed[j] && currentRows[j].TextId == currentRows[i].TextId + ) ) - .ToArray() + .ToList() ) - .ToList(); + ) + { + yield return row; + } } - catch (ArgumentException) + foreach (int i in minRefIndexes) { - throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); + rangeInfo.Rows[i].SameRefRows.Add(enumerators[i].Current); + completed[i] = !enumerators[i].MoveNext(); } - TextRow[] currentIncompleteRows = currentRows.Where((r, i) => !completed[i]).ToArray(); - List nonMinRefIndexes = Enumerable.Range(0, N).Except(minRefIndexes).ToList(); - int numberOfRemainingRows = N - completed.Count(c => c); - if (minRefIndexes.Count < numberOfRemainingRows || minRefIndexes.Count(i => !completed[i]) == 1) - //then there are some non-min refs or only one incomplete enumerator + } + else if (minRefIndexes.Count == numberOfRemainingRows) + // the refs are all the same + { + int compareAlignmentCorpus = -1; + if (AlignmentCorpus != null) { - List> minEnumerators = minRefIndexes.Select(i => enumerators[i]).ToList(); - List> nonMinEnumerators = nonMinRefIndexes - .Select(i => enumerators[i]) - .ToList(); - - if ( - nonMinRefIndexes.Any(i => !AllRows[i]) - && minRefIndexes.Any(i => !completed[i] && currentRows[i].IsInRange) - ) + do { - if ( - rangeInfo.IsInRange - && nonMinEnumerators.Any(e => - e.Current != null && e.Current.IsInRange && e.Current.Segment.Count > 0 - ) - ) + try { - yield return rangeInfo.CreateRow(); + compareAlignmentCorpus = alignmentEnumerator.MoveNext() + ? RowRefComparer.Compare( + currentIncompleteRows[0].Ref, + alignmentEnumerator.Current.Ref + ) + : 1; } - minRefIndexes.ForEach(i => rangeInfo.AddTextRow(enumerators[i].Current, i)); - nonMinRefIndexes.ForEach(i => rangeInfo.Rows[i].SameRefRows.Clear()); - } - else - { - bool anyNonMinEnumeratorsMidRange = nonMinRefIndexes.Any(i => - !completed[i] && !currentRows[i].IsRangeStart && currentRows[i].IsInRange - ); - foreach ( - NParallelTextRow row in CreateMinRefRows( - rangeInfo, - currentRows.ToArray(), - minRefIndexes.ToArray(), - nonMinRefIndexes.ToArray(), - forceInRange: minRefIndexes - .Select(i => - anyNonMinEnumeratorsMidRange - && nonMinRefIndexes.All(j => - !completed[j] && currentRows[j].TextId == currentRows[i].TextId - ) - ) - .ToList() - ) - ) + catch (ArgumentException) { - yield return row; + throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); } - } - foreach (int i in minRefIndexes) - { - rangeInfo.Rows[i].SameRefRows.Add(enumerators[i].Current); - completed[i] = !enumerators[i].MoveNext(); - } + } while (compareAlignmentCorpus < 0); } - else if (minRefIndexes.Count == numberOfRemainingRows) - // the refs are all the same + + if ( + minRefIndexes + .Select(i => + enumerators[i].Current.IsInRange && minRefIndexes.All(j => j == i || !AllRows[j]) + ) + .Any(b => b) + ) { - int compareAlignmentCorpus = -1; - if (AlignmentCorpus != null) + if (rangeInfo.IsInRange && AllInRangeHaveSegments(currentIncompleteRows)) { - do - { - try - { - compareAlignmentCorpus = alignmentEnumerator.MoveNext() - ? RowRefComparer.Compare( - currentIncompleteRows[0].Ref, - alignmentEnumerator.Current.Ref - ) - : 1; - } - catch (ArgumentException) - { - throw new CorpusAlignmentException( - currentRows.Select(e => e.Ref.ToString()).ToArray() - ); - } - } while (compareAlignmentCorpus < 0); + yield return rangeInfo.CreateRow(); } - if ( - minRefIndexes - .Select(i => - enumerators[i].Current.IsInRange && minRefIndexes.All(j => j == i || !AllRows[j]) - ) - .Any(b => b) - ) + for (int i = 0; i < rangeInfo.Rows.Count; i++) { - if (rangeInfo.IsInRange && AllInRangeHaveSegments(currentIncompleteRows)) - { - yield return rangeInfo.CreateRow(); - } - - for (int i = 0; i < rangeInfo.Rows.Count; i++) - { - rangeInfo.AddTextRow(currentRows[i], i); - rangeInfo.Rows[i].SameRefRows.Clear(); - } + rangeInfo.AddTextRow(currentRows[i], i); + rangeInfo.Rows[i].SameRefRows.Clear(); } - else + } + else + { + for (int i = 0; i < rangeInfo.Rows.Count; i++) { - for (int i = 0; i < rangeInfo.Rows.Count; i++) + for (int j = 0; j < rangeInfo.Rows.Count; j++) //TODO rework { - for (int j = 0; j < rangeInfo.Rows.Count; j++) //TODO rework - { - if (i == j || completed[i] || completed[j]) - continue; + if (i == j || completed[i] || completed[j]) + continue; - if (rangeInfo.CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) + if (rangeInfo.CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) + { + foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows) { - foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows) + var textRows = new TextRow[N]; + textRows[i] = tr; + textRows[j] = currentRows[j]; + foreach (NParallelTextRow r in CreateRows(rangeInfo, textRows)) { - var textRows = new TextRow[N]; - textRows[i] = tr; - textRows[j] = currentRows[j]; - foreach (NParallelTextRow r in CreateRows(rangeInfo, textRows)) - { - yield return r; - } + yield return r; } } } } - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - currentRows.Select((r, i) => completed[i] ? null : r).ToArray(), - alignedWordPairs: AlignmentCorpus != null && compareAlignmentCorpus == 0 - ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() - : null - ) - ) - { - yield return row; - } } - - for (int i = 0; i < rangeInfo.Rows.Count; i++) + foreach ( + NParallelTextRow row in CreateRows( + rangeInfo, + currentRows.Select((r, i) => completed[i] ? null : r).ToArray(), + alignedWordPairs: AlignmentCorpus != null && compareAlignmentCorpus == 0 + ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() + : null + ) + ) { - rangeInfo.Rows[i].SameRefRows.Add(currentRows[i]); - completed[i] = !enumerators[i].MoveNext(); + yield return row; } } - else + + for (int i = 0; i < rangeInfo.Rows.Count; i++) { - throw new CorpusAlignmentException( - minRefIndexes.Select(i => currentRows[i].Ref.ToString()).ToArray() - ); + rangeInfo.Rows[i].SameRefRows.Add(currentRows[i]); + completed[i] = !enumerators[i].MoveNext(); } } - - if (rangeInfo.IsInRange) - yield return rangeInfo.CreateRow(); + else + { + throw new CorpusAlignmentException( + minRefIndexes.Select(i => currentRows[i].Ref.ToString()).ToArray() + ); + } } + + if (rangeInfo.IsInRange) + yield return rangeInfo.CreateRow(); } private object[] CorrectVersification(object[] refs, int i) diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index e1b64281b..f21acc7c0 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -32,7 +32,7 @@ public ParallelTextCorpus( public IAlignmentCorpus AlignmentCorpus { get; } public IComparer RowRefComparer { get; } - private NParallelTextCorpus NParallelTextCorpus { get; set; } + public NParallelTextCorpus NParallelTextCorpus { get; } public override IEnumerable GetRows(IEnumerable textIds) { From d97ea1cf7dbf644510659096f67fcb35302a8444 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 13 Nov 2024 19:29:48 -0500 Subject: [PATCH 17/26] Remove alignment corpus from NPTC; move logic to PTC --- .../Corpora/NParallelTextCorpus.cs | 45 ++------------ src/SIL.Machine/Corpora/NParallelTextRow.cs | 2 - src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 58 +++++++++++++------ 3 files changed, 44 insertions(+), 61 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index bc5210c49..4d59fcacd 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -26,7 +26,6 @@ public override bool IsTokenized(int i) => public override int N => Corpora.Count; public IReadOnlyList AllRows { get; set; } public override IReadOnlyList Corpora { get; } - public IAlignmentCorpus AlignmentCorpus { get; set; } public IComparer RowRefComparer { get; } private HashSet GetTextIdsFromCorpora() @@ -53,7 +52,6 @@ public override IEnumerable GetRows(IEnumerable textId if (textIds != null) filterTextIds.IntersectWith(textIds); - IEnumerator alignmentEnumerator = null; List> enumeratedCorpora = new List>(); try { @@ -64,10 +62,7 @@ public override IEnumerable GetRows(IEnumerable textId new TextCorpusEnumerator(enumerator, Corpora[0].Versification, Corpora[i].Versification) ); } - - if (AlignmentCorpus != null) - alignmentEnumerator = AlignmentCorpus.GetRows(filterTextIds).GetEnumerator(); - foreach (NParallelTextRow row in GetRows(enumeratedCorpora, alignmentEnumerator)) + foreach (NParallelTextRow row in GetRows(enumeratedCorpora)) yield return row; } finally @@ -76,7 +71,6 @@ public override IEnumerable GetRows(IEnumerable textId { enumerator.Dispose(); } - alignmentEnumerator?.Dispose(); } } @@ -105,10 +99,7 @@ private IList MinRefIndexes(IList refs) return minRefIndexes; } - private IEnumerable GetRows( - IList> enumerators, - IEnumerator alignmentEnumerator - ) + private IEnumerable GetRows(IList> enumerators) { var rangeInfo = new NRangeInfo(N) { @@ -204,27 +195,6 @@ NParallelTextRow row in CreateMinRefRows( else if (minRefIndexes.Count == numberOfRemainingRows) // the refs are all the same { - int compareAlignmentCorpus = -1; - if (AlignmentCorpus != null) - { - do - { - try - { - compareAlignmentCorpus = alignmentEnumerator.MoveNext() - ? RowRefComparer.Compare( - currentIncompleteRows[0].Ref, - alignmentEnumerator.Current.Ref - ) - : 1; - } - catch (ArgumentException) - { - throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); - } - } while (compareAlignmentCorpus < 0); - } - if ( minRefIndexes .Select(i => @@ -271,10 +241,7 @@ NParallelTextRow row in CreateMinRefRows( foreach ( NParallelTextRow row in CreateRows( rangeInfo, - currentRows.Select((r, i) => completed[i] ? null : r).ToArray(), - alignedWordPairs: AlignmentCorpus != null && compareAlignmentCorpus == 0 - ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() - : null + currentRows.Select((r, i) => completed[i] ? null : r).ToArray() ) ) { @@ -313,8 +280,7 @@ private object[] CorrectVersification(object[] refs, int i) private IEnumerable CreateRows( NRangeInfo rangeInfo, IReadOnlyList rows, - IReadOnlyList forceInRange = null, - IReadOnlyList alignedWordPairs = null + IReadOnlyList forceInRange = null ) { if (rangeInfo.IsInRange) @@ -351,8 +317,7 @@ private IEnumerable CreateRows( yield return new NParallelTextRow(textId, refs) { NSegments = rows.Select(r => r?.Segment ?? Array.Empty()).ToArray(), - NFlags = flags.ToReadOnlyList(), - AlignedWordPairs = alignedWordPairs + NFlags = flags.ToReadOnlyList() }; } diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs index e76c57d93..4d58e9079 100644 --- a/src/SIL.Machine/Corpora/NParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -46,8 +46,6 @@ public bool IsRangeStart(int i) => public string Text(int i) => string.Join(" ", NSegments[i]); - public IReadOnlyCollection AlignedWordPairs { get; set; } - public NParallelTextRow Invert() { return new NParallelTextRow(TextId, NRefs.Reverse()) { NFlags = NFlags.Reverse().ToImmutableArray(), }; diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index f21acc7c0..89e8d6c15 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -1,4 +1,6 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; +using System.Linq; namespace SIL.Machine.Corpora { @@ -15,10 +17,7 @@ public ParallelTextCorpus( TargetCorpus = targetCorpus; AlignmentCorpus = alignmentCorpus ?? new DictionaryAlignmentCorpus(); RowRefComparer = rowRefComparer ?? new NParallelTextCorpus.DefaultRowRefComparer(); - NParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }) - { - AlignmentCorpus = AlignmentCorpus - }; + NParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }); } public override bool IsSourceTokenized => SourceCorpus.IsTokenized; @@ -36,22 +35,43 @@ public ParallelTextCorpus( public override IEnumerable GetRows(IEnumerable textIds) { - NParallelTextCorpus.AllRows = new bool[] { AllSourceRows, AllTargetRows }; - bool isScripture = SourceCorpus.IsScripture() && TargetCorpus.IsScripture(); - foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) + using (IEnumerator alignmentEnumerator = AlignmentCorpus.GetEnumerator()) { - yield return new ParallelTextRow( - nRow.TextId, - nRow.NRefs[0].Count > 0 || !isScripture ? nRow.NRefs[0] : new object[] { nRow.Ref }, - nRow.NRefs[1].Count > 0 || !isScripture ? nRow.NRefs[1] : new object[] { nRow.Ref } - ) + NParallelTextCorpus.AllRows = new bool[] { AllSourceRows, AllTargetRows }; + bool isScripture = SourceCorpus.IsScripture() && TargetCorpus.IsScripture(); + foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) { - SourceFlags = nRow.NFlags[0], - TargetFlags = nRow.NFlags[1], - SourceSegment = nRow.NSegments[0], - TargetSegment = nRow.NSegments[1], - AlignedWordPairs = nRow.AlignedWordPairs - }; + int compareAlignmentCorpus = -1; + if (AlignmentCorpus != null && nRow.NSegments.All(s => s.Count > 0)) + { + do + { + try + { + compareAlignmentCorpus = alignmentEnumerator.MoveNext() + ? RowRefComparer.Compare(nRow.Ref, alignmentEnumerator.Current.Ref) + : 1; + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(nRow.NRefs.Select(r => r.ToString()).ToArray()); + } + } while (compareAlignmentCorpus < 0); + } + yield return new ParallelTextRow( + nRow.TextId, + nRow.NRefs[0].Count > 0 || !isScripture ? nRow.NRefs[0] : new object[] { nRow.Ref }, + nRow.NRefs[1].Count > 0 || !isScripture ? nRow.NRefs[1] : new object[] { nRow.Ref } + ) + { + SourceFlags = nRow.NFlags[0], + TargetFlags = nRow.NFlags[1], + SourceSegment = nRow.NSegments[0], + TargetSegment = nRow.NSegments[1], + AlignedWordPairs = + compareAlignmentCorpus == 0 ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() : null + }; + } } } } From 37dde83a6778a9466dcfc7f1bf6edd13080790bb Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 13 Nov 2024 19:33:23 -0500 Subject: [PATCH 18/26] Remove redundant check --- src/SIL.Machine/Corpora/NParallelTextCorpus.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index 4d59fcacd..fc6d432df 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -289,9 +289,7 @@ private IEnumerable CreateRows( if (rows.All(r => r == null)) throw new ArgumentNullException("A corpus row must be specified."); - object[] defaultRefs = new object[] { }; - if (rows.Any(r => r != null)) - defaultRefs = new object[] { rows.Where(r => r != null).Select(r => r.Ref).First() }; + object[] defaultRefs = new object[] { rows.Where(r => r != null).Select(r => r.Ref).First() }; string textId = null; object[][] refs = new object[N][]; TextRowFlags[] flags = new TextRowFlags[N]; From b3bd7b9cf60ed0f850ede7517c6ea59e917f145a Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 14 Nov 2024 09:25:11 -0500 Subject: [PATCH 19/26] Property to field --- src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index 89e8d6c15..53d07257c 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -17,7 +17,7 @@ public ParallelTextCorpus( TargetCorpus = targetCorpus; AlignmentCorpus = alignmentCorpus ?? new DictionaryAlignmentCorpus(); RowRefComparer = rowRefComparer ?? new NParallelTextCorpus.DefaultRowRefComparer(); - NParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }); + _nParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }); } public override bool IsSourceTokenized => SourceCorpus.IsTokenized; @@ -31,15 +31,15 @@ public ParallelTextCorpus( public IAlignmentCorpus AlignmentCorpus { get; } public IComparer RowRefComparer { get; } - public NParallelTextCorpus NParallelTextCorpus { get; } + private readonly NParallelTextCorpus _nParallelTextCorpus; public override IEnumerable GetRows(IEnumerable textIds) { using (IEnumerator alignmentEnumerator = AlignmentCorpus.GetEnumerator()) { - NParallelTextCorpus.AllRows = new bool[] { AllSourceRows, AllTargetRows }; + _nParallelTextCorpus.AllRows = new bool[] { AllSourceRows, AllTargetRows }; bool isScripture = SourceCorpus.IsScripture() && TargetCorpus.IsScripture(); - foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) + foreach (var nRow in _nParallelTextCorpus.GetRows(textIds)) { int compareAlignmentCorpus = -1; if (AlignmentCorpus != null && nRow.NSegments.All(s => s.Count > 0)) From 0d351f2b616d5a87b24cd1ab8a6a5fcf54de5cbc Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 14 Nov 2024 10:39:00 -0500 Subject: [PATCH 20/26] Comments, small refactoring --- .../Corpora/NParallelTextCorpus.cs | 81 +++++++++---------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index fc6d432df..ae4c6e1f7 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -38,6 +38,7 @@ private HashSet GetTextIdsFromCorpora() textIds.AddRange(Corpora[i].Texts.Select(t => t.Id)); else textIds.IntersectWith(Corpora[i].Texts.Select(t => t.Id)); + if (AllRows[i]) allRowsTextIds.AddRange(Corpora[i].Texts.Select(t => t.Id)); } @@ -68,15 +69,13 @@ public override IEnumerable GetRows(IEnumerable textId finally { foreach (IEnumerator enumerator in enumeratedCorpora) - { enumerator.Dispose(); - } } } private static bool AllInRangeHaveSegments(IList rows) { - return rows.All(r => (r.IsInRange && r.Segment.Count > 0) || (!r.IsInRange)); + return rows.All(r => (r.IsInRange && !r.IsEmpty) || (!r.IsInRange)); } private IList MinRefIndexes(IList refs) @@ -133,33 +132,29 @@ private IEnumerable GetRows(IList> enumer { throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); } - TextRow[] currentIncompleteRows = currentRows.Where((r, i) => !completed[i]).ToArray(); List nonMinRefIndexes = Enumerable.Range(0, N).Except(minRefIndexes).ToList(); int numberOfRemainingRows = N - completed.Count(c => c); if (minRefIndexes.Count < numberOfRemainingRows || minRefIndexes.Count(i => !completed[i]) == 1) //then there are some non-min refs or only one incomplete enumerator { - List> minEnumerators = minRefIndexes.Select(i => enumerators[i]).ToList(); - List> nonMinEnumerators = nonMinRefIndexes - .Select(i => enumerators[i]) - .ToList(); - if ( - nonMinRefIndexes.Any(i => !AllRows[i]) - && minRefIndexes.Any(i => !completed[i] && currentRows[i].IsInRange) + nonMinRefIndexes.Any(i => !AllRows[i]) //At least one of the non-min rows has not been marked as 'all rows' + && minRefIndexes.Any(i => !completed[i] && currentRows[i].IsInRange) //and at least one of the min rows is not completed and in a range ) { if ( rangeInfo.IsInRange - && nonMinEnumerators.Any(e => - e.Current != null && e.Current.IsInRange && e.Current.Segment.Count > 0 - ) + && nonMinRefIndexes.Any(i => + !completed[i] && currentRows[i].IsInRange && !currentRows[i].IsEmpty + ) //At least one of the non-min rows is not completed, is in a range, and has content ) { yield return rangeInfo.CreateRow(); } - minRefIndexes.ForEach(i => rangeInfo.AddTextRow(enumerators[i].Current, i)); - nonMinRefIndexes.ForEach(i => rangeInfo.Rows[i].SameRefRows.Clear()); + foreach (int i in minRefIndexes) + rangeInfo.AddTextRow(enumerators[i].Current, i); + foreach (int i in nonMinRefIndexes) + rangeInfo.Rows[i].SameRefRows.Clear(); } else { @@ -179,7 +174,7 @@ NParallelTextRow row in CreateMinRefRows( !completed[j] && currentRows[j].TextId == currentRows[i].TextId ) ) - .ToList() + .ToList() //TODO refactor ) ) { @@ -196,14 +191,15 @@ NParallelTextRow row in CreateMinRefRows( // the refs are all the same { if ( - minRefIndexes - .Select(i => - enumerators[i].Current.IsInRange && minRefIndexes.All(j => j == i || !AllRows[j]) - ) - .Any(b => b) + minRefIndexes.Any(i => + currentRows[i].IsInRange && minRefIndexes.All(j => j == i || !AllRows[j]) + ) //At least one row is in range while the other rows are all not marked as 'all rows' ) { - if (rangeInfo.IsInRange && AllInRangeHaveSegments(currentIncompleteRows)) + if ( + rangeInfo.IsInRange + && AllInRangeHaveSegments(currentRows.Where((r, i) => !completed[i]).ToArray()) + ) { yield return rangeInfo.CreateRow(); } @@ -218,12 +214,15 @@ NParallelTextRow row in CreateMinRefRows( { for (int i = 0; i < rangeInfo.Rows.Count; i++) { - for (int j = 0; j < rangeInfo.Rows.Count; j++) //TODO rework + if (completed[i]) + continue; + + for (int j = 0; j < rangeInfo.Rows.Count; j++) { - if (i == j || completed[i] || completed[j]) + if (i == j || completed[j]) continue; - if (rangeInfo.CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) + if (CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) { foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows) { @@ -341,7 +340,7 @@ private IEnumerable CreateMinRefRows( { if (i == j) continue; - if (rangeInfo.CheckSameRefRows(sameRefRows, textRow)) + if (CheckSameRefRows(sameRefRows, textRow)) { alreadyYielded.Add(i); foreach (TextRow sameRefRow in sameRefRows) @@ -378,6 +377,20 @@ NParallelTextRow row in CreateRows(rangeInfo, textRows, forceInRange: forceInRan } } + private bool CheckSameRefRows(IList sameRefRows, TextRow otherRow) + { + try + { + if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) + sameRefRows.Clear(); + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); + } + return sameRefRows.Count > 0; + } + private class RangeRow { public IList Refs { get; } = new List(); @@ -407,20 +420,6 @@ public NRangeInfo(int n) } } - public bool CheckSameRefRows(IList sameRefRows, TextRow otherRow) - { - try - { - if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) - sameRefRows.Clear(); - } - catch (ArgumentException) - { - throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); - } - return sameRefRows.Count > 0; - } - public void AddTextRow(TextRow row, int index) { if (N <= index) From 14697e44ecbae90351cda05ab094ff13aef52d89 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Thu, 14 Nov 2024 15:32:05 -0500 Subject: [PATCH 21/26] More fixes --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 93 ++++-------------- .../Corpora/INParallelTextCorpus.cs | 4 - src/SIL.Machine/Corpora/MergeRule.cs | 8 ++ src/SIL.Machine/Corpora/MergedTextCorpus.cs | 74 +++++++++++++++ .../Corpora/NParallelTextCorpus.cs | 95 ++++++++++++------- .../Corpora/CorporaExtensionsTests.cs | 21 ++-- 6 files changed, 170 insertions(+), 125 deletions(-) create mode 100644 src/SIL.Machine/Corpora/MergeRule.cs create mode 100644 src/SIL.Machine/Corpora/MergedTextCorpus.cs diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index 38ad693c7..33186799c 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -538,87 +538,26 @@ public static INParallelTextCorpus AlignMany( return nParallelTextCorpus; } - public static ITextCorpus ChooseRandom(this INParallelTextCorpus corpus, int seed) - { - return new MergedCorpus(corpus, MergeRule.Random, seed); - } - - public static ITextCorpus ChooseFirst(this INParallelTextCorpus corpus) - { - return new MergedCorpus(corpus, MergeRule.First, 0); - } - - private enum MergeRule + public static ITextCorpus ChooseRandom( + this IEnumerable corpora, + IEnumerable allRows, + int seed + ) { - First, - Random + return new MergedTextCorpus( + new NParallelTextCorpus(corpora) { AllRows = allRows.ToArray() }, + MergeRule.Random, + seed + ); } - private class MergedCorpus : TextCorpusBase + public static ITextCorpus ChooseFirst(this IEnumerable corpora, IEnumerable allRows) { - private readonly INParallelTextCorpus _corpus; - - private readonly MergeRule _mergeRule; - - private readonly Random _random; - - public MergedCorpus(INParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule, int seed) - { - _corpus = nParallelTextCorpus; - _mergeRule = mergeRule; - _random = new Random(seed); - } - - public override IEnumerable Texts => _corpus.Corpora.SelectMany(c => c.Texts); - - public override bool IsTokenized => Enumerable.Range(0, _corpus.N).All(i => _corpus.IsTokenized(i)); - - public override ScrVers Versification => _corpus.N > 0 ? _corpus.Corpora[0].Versification : null; - - public override IEnumerable GetRows(IEnumerable textIds) - { - int indexOfInRangeRow = -1; - foreach (NParallelTextRow nRow in _corpus.GetRows(textIds)) - { - IReadOnlyList nonEmptyIndices = nRow - .NSegments.Select((s, i) => (s, i)) - .Where(pair => pair.s.Count > 0 || nRow.IsInRange(pair.i)) - .Select(pair => pair.i) - .ToList(); - IReadOnlyList indices = - nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList(); - if (indexOfInRangeRow == -1) - { - indices = indices.Where(i => nRow.IsRangeStart(i) || !nRow.IsInRange(i)).ToList(); - } - if (indices.Count == 0) - continue; - int indexOfSelectedRow = -1; - switch (_mergeRule) - { - case MergeRule.First: - indexOfSelectedRow = indices.First(); - break; - case MergeRule.Random: - indexOfSelectedRow = indices[_random.Next(0, indices.Count)]; - break; - } - indexOfSelectedRow = indexOfInRangeRow != -1 ? indexOfInRangeRow : indexOfSelectedRow; - if (!nRow.IsInRange(indexOfSelectedRow)) - { - indexOfInRangeRow = -1; - } - if (nRow.IsRangeStart(indexOfSelectedRow)) - { - indexOfInRangeRow = indexOfSelectedRow; - } - yield return new TextRow(nRow.TextId, nRow.Ref) - { - Segment = nRow.NSegments[indexOfSelectedRow], - Flags = nRow.NFlags[indexOfSelectedRow] - }; - } - } + return new MergedTextCorpus( + new NParallelTextCorpus(corpora) { AllRows = allRows.ToArray() }, + MergeRule.First, + 0 + ); } #endregion diff --git a/src/SIL.Machine/Corpora/INParallelTextCorpus.cs b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs index 0dfde2fa3..5a1e86f76 100644 --- a/src/SIL.Machine/Corpora/INParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs @@ -4,10 +4,6 @@ namespace SIL.Machine.Corpora { public interface INParallelTextCorpus : ICorpus { - int N { get; } - IReadOnlyList Corpora { get; } - - bool IsTokenized(int i); int Count(bool includeEmpty = true, IEnumerable textIds = null); IEnumerable GetRows(IEnumerable textIds); diff --git a/src/SIL.Machine/Corpora/MergeRule.cs b/src/SIL.Machine/Corpora/MergeRule.cs new file mode 100644 index 000000000..be9a2ceec --- /dev/null +++ b/src/SIL.Machine/Corpora/MergeRule.cs @@ -0,0 +1,8 @@ +namespace SIL.Machine.Corpora +{ + public enum MergeRule + { + First, + Random + } +} diff --git a/src/SIL.Machine/Corpora/MergedTextCorpus.cs b/src/SIL.Machine/Corpora/MergedTextCorpus.cs new file mode 100644 index 000000000..890f7e10e --- /dev/null +++ b/src/SIL.Machine/Corpora/MergedTextCorpus.cs @@ -0,0 +1,74 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public class MergedTextCorpus : TextCorpusBase + { + private readonly NParallelTextCorpus _corpus; + + private readonly MergeRule _mergeRule; + + private readonly Random _random; + + public MergedTextCorpus(NParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule, int seed) + { + _corpus = nParallelTextCorpus; + _mergeRule = mergeRule; + _random = new Random(seed); + } + + public override IEnumerable Texts => _corpus.Corpora.SelectMany(c => c.Texts); + + public override bool IsTokenized => Enumerable.Range(0, _corpus.N).All(i => _corpus.IsTokenized(i)); + + public override ScrVers Versification => _corpus.N > 0 ? _corpus.Corpora[0].Versification : null; + + public override IEnumerable GetRows(IEnumerable textIds) + { + int indexOfInRangeRow = -1; + foreach (NParallelTextRow nRow in _corpus.GetRows(textIds)) + { + IReadOnlyList nonEmptyIndices = nRow + .NSegments.Select((s, i) => (s, i)) + .Where(pair => pair.s.Count > 0 || nRow.IsInRange(pair.i)) + .Select(pair => pair.i) + .ToList(); + IReadOnlyList indices = + nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList(); + if (indexOfInRangeRow == -1) + { + indices = indices.Where(i => nRow.IsRangeStart(i) || !nRow.IsInRange(i)).ToList(); + } + if (indices.Count == 0) + continue; + int indexOfSelectedRow = -1; + switch (_mergeRule) + { + case MergeRule.First: + indexOfSelectedRow = indices.First(); + break; + case MergeRule.Random: + indexOfSelectedRow = indices[_random.Next(0, indices.Count)]; + break; + } + indexOfSelectedRow = indexOfInRangeRow != -1 ? indexOfInRangeRow : indexOfSelectedRow; + if (!nRow.IsInRange(indexOfSelectedRow)) + { + indexOfInRangeRow = -1; + } + if (nRow.IsRangeStart(indexOfSelectedRow)) + { + indexOfInRangeRow = indexOfSelectedRow; + } + yield return new TextRow(nRow.TextId, nRow.Ref) + { + Segment = nRow.NSegments[indexOfSelectedRow], + Flags = nRow.NFlags[indexOfSelectedRow] + }; + } + } + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index ae4c6e1f7..9b5bf7777 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -106,9 +106,18 @@ private IEnumerable GetRows(IList> enumer RowRefComparer = RowRefComparer }; - bool[] completed = enumerators.Select(e => !e.MoveNext()).ToArray(); + bool[] completed = new bool[N]; + int numCompleted = 0; + for (int i = 0; i < N; i++) + { + bool isCompleted = !enumerators[i].MoveNext(); + completed[i] = isCompleted; + if (isCompleted) + numCompleted++; + } + int numberOfRemainingRows = N - numCompleted; - while (!completed.All(c => c)) + while (numCompleted < N) { List minRefIndexes; List currentRows = enumerators.Select(e => e.Current).ToList(); @@ -133,7 +142,6 @@ private IEnumerable GetRows(IList> enumer throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); } List nonMinRefIndexes = Enumerable.Range(0, N).Except(minRefIndexes).ToList(); - int numberOfRemainingRows = N - completed.Count(c => c); if (minRefIndexes.Count < numberOfRemainingRows || minRefIndexes.Count(i => !completed[i]) == 1) //then there are some non-min refs or only one incomplete enumerator { @@ -172,9 +180,9 @@ NParallelTextRow row in CreateMinRefRows( anyNonMinEnumeratorsMidRange && nonMinRefIndexes.All(j => !completed[j] && currentRows[j].TextId == currentRows[i].TextId - ) + ) //All non-min rows have the same textId as the given min row ) - .ToList() //TODO refactor + .ToList() ) ) { @@ -184,7 +192,13 @@ NParallelTextRow row in CreateMinRefRows( foreach (int i in minRefIndexes) { rangeInfo.Rows[i].SameRefRows.Add(enumerators[i].Current); - completed[i] = !enumerators[i].MoveNext(); + bool isCompleted = !enumerators[i].MoveNext(); + completed[i] = isCompleted; + if (isCompleted) + { + numCompleted++; + numberOfRemainingRows--; + } } } else if (minRefIndexes.Count == numberOfRemainingRows) @@ -212,31 +226,11 @@ NParallelTextRow row in CreateMinRefRows( } else { - for (int i = 0; i < rangeInfo.Rows.Count; i++) + foreach (NParallelTextRow row in CreateSameRefRows(rangeInfo, completed, currentRows)) { - if (completed[i]) - continue; - - for (int j = 0; j < rangeInfo.Rows.Count; j++) - { - if (i == j || completed[j]) - continue; - - if (CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) - { - foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows) - { - var textRows = new TextRow[N]; - textRows[i] = tr; - textRows[j] = currentRows[j]; - foreach (NParallelTextRow r in CreateRows(rangeInfo, textRows)) - { - yield return r; - } - } - } - } + yield return row; } + foreach ( NParallelTextRow row in CreateRows( rangeInfo, @@ -251,7 +245,13 @@ NParallelTextRow row in CreateRows( for (int i = 0; i < rangeInfo.Rows.Count; i++) { rangeInfo.Rows[i].SameRefRows.Add(currentRows[i]); - completed[i] = !enumerators[i].MoveNext(); + bool isCompleted = !enumerators[i].MoveNext(); + completed[i] = isCompleted; + if (isCompleted) + { + numCompleted++; + numberOfRemainingRows--; + } } } else @@ -338,8 +338,6 @@ private IEnumerable CreateMinRefRows( TextRow textRow = currentRows[i]; foreach ((List sameRefRows, int j) in sameRefRowsPerIndex) { - if (i == j) - continue; if (CheckSameRefRows(sameRefRows, textRow)) { alreadyYielded.Add(i); @@ -391,6 +389,39 @@ private bool CheckSameRefRows(IList sameRefRows, TextRow otherRow) return sameRefRows.Count > 0; } + private IEnumerable CreateSameRefRows( + NRangeInfo rangeInfo, + IList completed, + IList currentRows + ) + { + for (int i = 0; i < rangeInfo.Rows.Count; i++) + { + if (completed[i]) + continue; + + for (int j = 0; j < rangeInfo.Rows.Count; j++) + { + if (i == j || completed[j]) + continue; + + if (CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) + { + foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows) + { + var textRows = new TextRow[N]; + textRows[i] = tr; + textRows[j] = currentRows[j]; + foreach (NParallelTextRow r in CreateRows(rangeInfo, textRows)) + { + yield return r; + } + } + } + } + } + } + private class RangeRow { public IList Refs { get; } = new List(); diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs index 2f8ec3a57..db5e85ac5 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs @@ -94,8 +94,7 @@ public void MergedCorpus_SelectFirst() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, true, true] }; - var mergedCorpus = nParallelCorpus.ChooseFirst(); + var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseFirst([true, true, true]); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); @@ -139,8 +138,7 @@ public void MergedCorpus_SelectRandom_Seed123456() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, true, true] }; - var mergedCorpus = nParallelCorpus.ChooseRandom(123456); + var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseRandom([true, true, true], 123456); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); Assert.Multiple(() => @@ -187,8 +185,7 @@ public void MergedCorpus_SelectRandom_Seed4501() } ) ); - var nParallelCorpus = new NParallelTextCorpus([corpus1, corpus2, corpus3]) { AllRows = [true, true, true] }; - var mergedCorpus = nParallelCorpus.ChooseRandom(4501); + var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseRandom([true, true, true], 4501); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); Assert.Multiple(() => @@ -236,9 +233,9 @@ public void AlignMergedCorpora() ) ); - ITextCorpus sourceCorpus = (new ITextCorpus[] { sourceCorpus1, sourceCorpus1, sourceCorpus3 }) - .AlignMany([true, true, true]) - .ChooseFirst(); + ITextCorpus sourceCorpus = new List { sourceCorpus1, sourceCorpus2, sourceCorpus3 }.ChooseFirst( + [true, true, true] + ); var targetCorpus1 = new DictionaryTextCorpus( new MemoryText( @@ -274,9 +271,9 @@ public void AlignMergedCorpora() ) ); - ITextCorpus targetCorpus = (new ITextCorpus[] { targetCorpus1, targetCorpus2, targetCorpus3 }) - .AlignMany([true, true, true]) - .ChooseFirst(); + ITextCorpus targetCorpus = new List { targetCorpus1, targetCorpus2, targetCorpus3 }.ChooseFirst( + [true, true, true] + ); IParallelTextCorpus alignedCorpus = sourceCorpus.AlignRows(targetCorpus); ParallelTextRow[] rows = alignedCorpus.GetRows().ToArray(); From 2058d4d0a78c9132e08f1e3a3d840f20f8a1b8e1 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 15 Nov 2024 10:27:51 -0500 Subject: [PATCH 22/26] Change mergedtextcorpus parameter --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 12 ++---------- src/SIL.Machine/Corpora/MergedTextCorpus.cs | 9 +++++++-- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index 33186799c..f5e1b4b12 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -544,20 +544,12 @@ public static ITextCorpus ChooseRandom( int seed ) { - return new MergedTextCorpus( - new NParallelTextCorpus(corpora) { AllRows = allRows.ToArray() }, - MergeRule.Random, - seed - ); + return new MergedTextCorpus(corpora, allRows.ToArray(), MergeRule.Random, seed); } public static ITextCorpus ChooseFirst(this IEnumerable corpora, IEnumerable allRows) { - return new MergedTextCorpus( - new NParallelTextCorpus(corpora) { AllRows = allRows.ToArray() }, - MergeRule.First, - 0 - ); + return new MergedTextCorpus(corpora, allRows.ToArray(), MergeRule.First, 0); } #endregion diff --git a/src/SIL.Machine/Corpora/MergedTextCorpus.cs b/src/SIL.Machine/Corpora/MergedTextCorpus.cs index 890f7e10e..1bcbf822f 100644 --- a/src/SIL.Machine/Corpora/MergedTextCorpus.cs +++ b/src/SIL.Machine/Corpora/MergedTextCorpus.cs @@ -13,9 +13,14 @@ public class MergedTextCorpus : TextCorpusBase private readonly Random _random; - public MergedTextCorpus(NParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule, int seed) + public MergedTextCorpus( + IEnumerable corpora, + IEnumerable allRows, + MergeRule mergeRule, + int seed + ) { - _corpus = nParallelTextCorpus; + _corpus = new NParallelTextCorpus(corpora) { AllRows = allRows.ToList() }; _mergeRule = mergeRule; _random = new Random(seed); } From 6ab2faa5876656c76756191aa8da4070b1562d81 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 15 Nov 2024 11:47:32 -0500 Subject: [PATCH 23/26] More reviewer-requested changes --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 12 ++++-------- src/SIL.Machine/Corpora/MergedTextCorpus.cs | 14 ++++++-------- .../Corpora/CorporaExtensionsTests.cs | 14 +++++--------- 3 files changed, 15 insertions(+), 25 deletions(-) diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index f5e1b4b12..e20be3f7c 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -538,18 +538,14 @@ public static INParallelTextCorpus AlignMany( return nParallelTextCorpus; } - public static ITextCorpus ChooseRandom( - this IEnumerable corpora, - IEnumerable allRows, - int seed - ) + public static ITextCorpus ChooseRandom(this IEnumerable corpora, int seed) { - return new MergedTextCorpus(corpora, allRows.ToArray(), MergeRule.Random, seed); + return new MergedTextCorpus(corpora, MergeRule.Random, seed); } - public static ITextCorpus ChooseFirst(this IEnumerable corpora, IEnumerable allRows) + public static ITextCorpus ChooseFirst(this IEnumerable corpora) { - return new MergedTextCorpus(corpora, allRows.ToArray(), MergeRule.First, 0); + return new MergedTextCorpus(corpora, MergeRule.First); } #endregion diff --git a/src/SIL.Machine/Corpora/MergedTextCorpus.cs b/src/SIL.Machine/Corpora/MergedTextCorpus.cs index 1bcbf822f..5e85b60bf 100644 --- a/src/SIL.Machine/Corpora/MergedTextCorpus.cs +++ b/src/SIL.Machine/Corpora/MergedTextCorpus.cs @@ -13,16 +13,14 @@ public class MergedTextCorpus : TextCorpusBase private readonly Random _random; - public MergedTextCorpus( - IEnumerable corpora, - IEnumerable allRows, - MergeRule mergeRule, - int seed - ) + public MergedTextCorpus(IEnumerable corpora, MergeRule mergeRule, int? seed = null) { - _corpus = new NParallelTextCorpus(corpora) { AllRows = allRows.ToList() }; + _corpus = new NParallelTextCorpus(corpora) { AllRows = Enumerable.Repeat(true, corpora.Count()).ToArray() }; _mergeRule = mergeRule; - _random = new Random(seed); + if (seed != null) + _random = new Random(seed.Value); + else + _random = new Random(); } public override IEnumerable Texts => _corpus.Corpora.SelectMany(c => c.Texts); diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs index db5e85ac5..836f3bdaf 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs @@ -94,7 +94,7 @@ public void MergedCorpus_SelectFirst() } ) ); - var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseFirst([true, true, true]); + var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseFirst(); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); Assert.That(rows[0].Text, Is.EqualTo("source 1 segment 1 .")); @@ -138,7 +138,7 @@ public void MergedCorpus_SelectRandom_Seed123456() } ) ); - var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseRandom([true, true, true], 123456); + var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseRandom(123456); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); Assert.Multiple(() => @@ -185,7 +185,7 @@ public void MergedCorpus_SelectRandom_Seed4501() } ) ); - var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseRandom([true, true, true], 4501); + var mergedCorpus = new List { corpus1, corpus2, corpus3 }.ChooseRandom(4501); var rows = mergedCorpus.ToArray(); Assert.That(rows, Has.Length.EqualTo(3), JsonSerializer.Serialize(rows)); Assert.Multiple(() => @@ -233,9 +233,7 @@ public void AlignMergedCorpora() ) ); - ITextCorpus sourceCorpus = new List { sourceCorpus1, sourceCorpus2, sourceCorpus3 }.ChooseFirst( - [true, true, true] - ); + ITextCorpus sourceCorpus = new List { sourceCorpus1, sourceCorpus2, sourceCorpus3 }.ChooseFirst(); var targetCorpus1 = new DictionaryTextCorpus( new MemoryText( @@ -271,9 +269,7 @@ public void AlignMergedCorpora() ) ); - ITextCorpus targetCorpus = new List { targetCorpus1, targetCorpus2, targetCorpus3 }.ChooseFirst( - [true, true, true] - ); + ITextCorpus targetCorpus = new List { targetCorpus1, targetCorpus2, targetCorpus3 }.ChooseFirst(); IParallelTextCorpus alignedCorpus = sourceCorpus.AlignRows(targetCorpus); ParallelTextRow[] rows = alignedCorpus.GetRows().ToArray(); From 6ca6027167209d2a008a565a2b51890aebb52959 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 15 Nov 2024 12:46:08 -0500 Subject: [PATCH 24/26] Move same ref rows out of range info --- .../Corpora/NParallelTextCorpus.cs | 39 +++++++++++-------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index 9b5bf7777..899a9cf92 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -105,6 +105,11 @@ private IEnumerable GetRows(IList> enumer Versifications = Corpora.Select(c => c.Versification).ToArray(), RowRefComparer = RowRefComparer }; + List> sameRefRows = new List>(); + for (int i = 0; i < N; i++) + { + sameRefRows.Add(new List()); + } bool[] completed = new bool[N]; int numCompleted = 0; @@ -162,7 +167,7 @@ private IEnumerable GetRows(IList> enumer foreach (int i in minRefIndexes) rangeInfo.AddTextRow(enumerators[i].Current, i); foreach (int i in nonMinRefIndexes) - rangeInfo.Rows[i].SameRefRows.Clear(); + sameRefRows[i].Clear(); } else { @@ -175,6 +180,7 @@ NParallelTextRow row in CreateMinRefRows( currentRows.ToArray(), minRefIndexes.ToArray(), nonMinRefIndexes.ToArray(), + sameRefRows, forceInRange: minRefIndexes .Select(i => anyNonMinEnumeratorsMidRange @@ -191,7 +197,7 @@ NParallelTextRow row in CreateMinRefRows( } foreach (int i in minRefIndexes) { - rangeInfo.Rows[i].SameRefRows.Add(enumerators[i].Current); + sameRefRows[i].Add(enumerators[i].Current); bool isCompleted = !enumerators[i].MoveNext(); completed[i] = isCompleted; if (isCompleted) @@ -221,12 +227,14 @@ NParallelTextRow row in CreateMinRefRows( for (int i = 0; i < rangeInfo.Rows.Count; i++) { rangeInfo.AddTextRow(currentRows[i], i); - rangeInfo.Rows[i].SameRefRows.Clear(); + sameRefRows[i].Clear(); } } else { - foreach (NParallelTextRow row in CreateSameRefRows(rangeInfo, completed, currentRows)) + foreach ( + NParallelTextRow row in CreateSameRefRows(rangeInfo, completed, currentRows, sameRefRows) + ) { yield return row; } @@ -244,7 +252,7 @@ NParallelTextRow row in CreateRows( for (int i = 0; i < rangeInfo.Rows.Count; i++) { - rangeInfo.Rows[i].SameRefRows.Add(currentRows[i]); + sameRefRows[i].Add(currentRows[i]); bool isCompleted = !enumerators[i].MoveNext(); completed[i] = isCompleted; if (isCompleted) @@ -323,21 +331,18 @@ private IEnumerable CreateMinRefRows( IReadOnlyList currentRows, IReadOnlyList minRefIndexes, IReadOnlyList nonMinRefIndexes, + IReadOnlyList> sameRefRowsPerIndex, IReadOnlyList forceInRange = null ) { - List<(List Rows, int Index)> sameRefRowsPerIndex = nonMinRefIndexes - .Select(i => (rangeInfo.Rows[i], i)) - .Select(pair => (pair.Item1.SameRefRows.ToList(), pair.Item2)) - .ToList(); - HashSet alreadyYielded = new HashSet(); TextRow[] textRows; foreach (int i in minRefIndexes) { TextRow textRow = currentRows[i]; - foreach ((List sameRefRows, int j) in sameRefRowsPerIndex) + foreach (int j in nonMinRefIndexes) { + IList sameRefRows = sameRefRowsPerIndex[j]; if (CheckSameRefRows(sameRefRows, textRow)) { alreadyYielded.Add(i); @@ -392,22 +397,23 @@ private bool CheckSameRefRows(IList sameRefRows, TextRow otherRow) private IEnumerable CreateSameRefRows( NRangeInfo rangeInfo, IList completed, - IList currentRows + IList currentRows, + IReadOnlyList> sameRefRows ) { - for (int i = 0; i < rangeInfo.Rows.Count; i++) + for (int i = 0; i < N; i++) { if (completed[i]) continue; - for (int j = 0; j < rangeInfo.Rows.Count; j++) + for (int j = 0; j < N; j++) { if (i == j || completed[j]) continue; - if (CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) + if (CheckSameRefRows(sameRefRows[i], currentRows[j])) { - foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows) + foreach (TextRow tr in sameRefRows[i]) { var textRows = new TextRow[N]; textRows[i] = tr; @@ -426,7 +432,6 @@ private class RangeRow { public IList Refs { get; } = new List(); public IList Segment { get; } = new List(); - public IList SameRefRows { get; } = new List(); public bool IsSentenceStart { get; set; } = false; public bool IsInRange => Refs.Count > 0; public bool IsEmpty => Segment.Count == 0; From 5376918aac75fbcfb5ffd5f32a5a92e6bd104361 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 18 Nov 2024 16:19:39 -0500 Subject: [PATCH 25/26] Make seed optional; remove unneeded code --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 2 +- .../Corpora/NParallelTextCorpus.cs | 9 ---- .../Corpora/ParallelTextCorpusTests.cs | 49 +++++++++++++++++++ 3 files changed, 50 insertions(+), 10 deletions(-) diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index e20be3f7c..372ba2014 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -538,7 +538,7 @@ public static INParallelTextCorpus AlignMany( return nParallelTextCorpus; } - public static ITextCorpus ChooseRandom(this IEnumerable corpora, int seed) + public static ITextCorpus ChooseRandom(this IEnumerable corpora, int? seed = null) { return new MergedTextCorpus(corpora, MergeRule.Random, seed); } diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index 899a9cf92..c8f2c9042 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -155,15 +155,6 @@ private IEnumerable GetRows(IList> enumer && minRefIndexes.Any(i => !completed[i] && currentRows[i].IsInRange) //and at least one of the min rows is not completed and in a range ) { - if ( - rangeInfo.IsInRange - && nonMinRefIndexes.Any(i => - !completed[i] && currentRows[i].IsInRange && !currentRows[i].IsEmpty - ) //At least one of the non-min rows is not completed, is in a range, and has content - ) - { - yield return rangeInfo.CreateRow(); - } foreach (int i in minRefIndexes) rangeInfo.AddTextRow(enumerators[i].Current, i); foreach (int i in nonMinRefIndexes) diff --git a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs index d40529c65..2c5c0a90d 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs @@ -444,6 +444,55 @@ public void GetRows_OverlappingRanges() Assert.That(rows[0].IsTargetSentenceStart, Is.True); } + [Test] + public void GetRows_OverlappingRangesAndMissingRow() + { + var sourceCorpus = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 1, + "source segment 1 . source segment 2 . source segment 3 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 2, flags: TextRowFlags.InRange), + TextRow("text1", 3, flags: TextRowFlags.InRange) + } + ) + ); + var targetCorpus = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow( + "text1", + 3, + "target segment 3 . target segment 4 .", + TextRowFlags.SentenceStart | TextRowFlags.InRange | TextRowFlags.RangeStart + ), + TextRow("text1", 4, flags: TextRowFlags.InRange) + } + ) + ); + + var parallelCorpus = new ParallelTextCorpus(sourceCorpus, targetCorpus); + ParallelTextRow[] rows = parallelCorpus.ToArray(); + Assert.That(rows.Length, Is.EqualTo(1)); + Assert.That(rows[0].SourceRefs, Is.EqualTo(new[] { 1, 2, 3 })); + Assert.That(rows[0].TargetRefs, Is.EqualTo(new[] { 3, 4 })); + Assert.That( + rows[0].SourceSegment, + Is.EqualTo("source segment 1 . source segment 2 . source segment 3 .".Split()) + ); + Assert.That(rows[0].TargetSegment, Is.EqualTo("target segment 3 . target segment 4 .".Split())); + Assert.That(rows[0].IsSourceSentenceStart, Is.True); + Assert.That(rows[0].IsTargetSentenceStart, Is.True); + } + [Test] public void GetRows_AdjacentRangesSameText() { From 4633385f892626eb5e06fb595ef84a4d157cfa04 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 18 Nov 2024 16:25:58 -0500 Subject: [PATCH 26/26] Fix typo --- tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs index 2c5c0a90d..8df6d7873 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParallelTextCorpusTests.cs @@ -445,7 +445,7 @@ public void GetRows_OverlappingRanges() } [Test] - public void GetRows_OverlappingRangesAndMissingRow() + public void GetRows_OverlappingRangesAndMissingRows() { var sourceCorpus = new DictionaryTextCorpus( new MemoryText(