From e0fba037288b65b2afec3ae46e151fa124935512 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 23 Oct 2024 10:42:11 -0400 Subject: [PATCH] broken --- .../Corpora/CorpusAlignmentException.cs | 5 + .../Corpora/INParallelTextCorpus.cs | 11 + .../Corpora/NParallelTextCorpus.cs | 658 ++++++++++++++++++ .../Corpora/NParallelTextCorpusBase.cs | 36 + src/SIL.Machine/Corpora/NParallelTextRow.cs | 54 ++ .../Corpora/ParallelCorpusEnumerator.cs | 126 ++++ 6 files changed, 890 insertions(+) create mode 100644 src/SIL.Machine/Corpora/INParallelTextCorpus.cs create mode 100644 src/SIL.Machine/Corpora/NParallelTextCorpus.cs create mode 100644 src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs create mode 100644 src/SIL.Machine/Corpora/NParallelTextRow.cs create mode 100644 src/SIL.Machine/Corpora/ParallelCorpusEnumerator.cs diff --git a/src/SIL.Machine/Corpora/CorpusAlignmentException.cs b/src/SIL.Machine/Corpora/CorpusAlignmentException.cs index c86dd8cfd..2b8129858 100644 --- a/src/SIL.Machine/Corpora/CorpusAlignmentException.cs +++ b/src/SIL.Machine/Corpora/CorpusAlignmentException.cs @@ -8,5 +8,10 @@ public CorpusAlignmentException(string sourceRef, string targetRef) : base( $"Invalid format in {sourceRef} and {targetRef}. Mismatched key formats \"{sourceRef}\" and \"{targetRef}\". There may be an extraneous tab, missing ref, or inconsistent use of user-defined refs." ) { } + + public CorpusAlignmentException(string[] refs) + : base( + $"Invalid format in {string.Join(", ", refs)}. Mismatched key formats. There may be an extraneous tab, missing ref, or inconsistent use of user-defined refs." + ) { } } } diff --git a/src/SIL.Machine/Corpora/INParallelTextCorpus.cs b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs new file mode 100644 index 000000000..5a1e86f76 --- /dev/null +++ b/src/SIL.Machine/Corpora/INParallelTextCorpus.cs @@ -0,0 +1,11 @@ +using System.Collections.Generic; + +namespace SIL.Machine.Corpora +{ + public interface INParallelTextCorpus : ICorpus + { + int Count(bool includeEmpty = true, IEnumerable textIds = null); + + IEnumerable GetRows(IEnumerable textIds); + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs new file mode 100644 index 000000000..e38d12383 --- /dev/null +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -0,0 +1,658 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using SIL.Linq; +using SIL.ObjectModel; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public class NParallelTextCorpus : NParallelTextCorpusBase + { + public NParallelTextCorpus(IEnumerable corpora, IComparer rowRefComparer = null) + { + Corpora = corpora.ToImmutableArray(); + if (Corpora.Count < 1) + throw new ArgumentException("There must be at least one corpora.", nameof(corpora)); + RowRefComparer = rowRefComparer ?? new DefaultRowRefComparer(); + AllRowsList = new bool[Corpora.Count] + .Select(_ => false) + .ToImmutableArray(); + } + + public bool GetIsTokenized(int i) => + i < Corpora.Count ? Corpora[i].IsTokenized : throw new ArgumentOutOfRangeException(nameof(i)); + + public int N => Corpora.Count; + + public IReadOnlyList AllRowsList { get; set; } + public IReadOnlyList Corpora { get; } + public IComparer RowRefComparer { get; } + + private static HashSet GetTextIdsFromCorpora( + IEnumerable corpora, + IEnumerable allRowsEnumerate + ) + { + IReadOnlyList> textIdListOfLists = corpora + .Select(c => c.Texts.Select(t => t.Id)) + .ToImmutableArray(); + + HashSet textIds = textIdListOfLists + .Skip(1) + .Aggregate( + new HashSet(textIdListOfLists.First()), + (h, e) => + { + h.IntersectWith(e); + return h; + } + ); + allRowsEnumerate + .Select((allRows, i) => (allRows, i)) + .Where(t => t.allRows) + .ForEach(t => textIds.UnionWith(textIdListOfLists[t.i])); + return textIds; + } + + public override IEnumerable GetRows(IEnumerable textIds) + { + HashSet filterTextIds = GetTextIdsFromCorpora(Corpora, AllRowsList); + + if (textIds != null) + filterTextIds.IntersectWith(textIds); + + IList> enumeratedCorpora = new List>(); + try + { + for (int i = 0; i < Corpora.Count; i++) + { + if (i == 0) + { + enumeratedCorpora.Add(Corpora[0].GetRows(filterTextIds).GetEnumerator()); + } + else + { + enumeratedCorpora.Add( + new ParallelCorpusEnumerator( + Corpora[i].GetRows(filterTextIds).GetEnumerator(), + Corpora[0].Versification, + Corpora[i].Versification + ) + ); + } + } + return GetRows(enumeratedCorpora); + } + finally + { + foreach (IEnumerator enumerator in enumeratedCorpora) + { + enumerator.Dispose(); + } + } + } + + private IList MinRefIndexes(IList refs) + { + object minRef = refs[0]; + IList minRefIndexes = new List(0); + for (int i = 1; i < refs.Count; i++) + { + if (RowRefComparer.Compare(refs[i], minRef) < 0) + { + minRef = refs[i]; + minRefIndexes.Clear(); + minRefIndexes.Add(i); + } + else if (RowRefComparer.Compare(refs[i], minRef) == 0) + { + minRefIndexes.Add(i); + } + } + return minRefIndexes; + } + + private IEnumerable GetRows(IList> enumerators) + { + { + var rangeInfo = new NRangeInfo { Versification = Corpora[0].Versification }; + + List[] sameRefRows = new List[Corpora.Count]; + bool[] completed = enumerators.Select(e => !e.MoveNext()).ToArray(); + + while (!completed.Any()) + { + IList minRefIndexes; + IList currentRefs = enumerators.Select(e => e.Current.Ref).ToArray(); + try + { + minRefIndexes = MinRefIndexes(currentRefs); + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(currentRefs.Select(r => r.ToString()).ToArray()); + } + if (minRefIndexes.Count == N) + { + if ( + (!AllTargetRows && srcEnumerator.Current.IsInRange) + || (!AllSourceRows && trgEnumerator.Current.IsInRange) + ) + { + if ( + rangeInfo.IsInRange + && ( + ( + srcEnumerator.Current.IsInRange + && !trgEnumerator.Current.IsInRange + && srcEnumerator.Current.Segment.Count > 0 + ) + || ( + !srcEnumerator.Current.IsInRange + && trgEnumerator.Current.IsInRange + && trgEnumerator.Current.Segment.Count > 0 + ) + || ( + srcEnumerator.Current.IsInRange + && trgEnumerator.Current.IsInRange + && srcEnumerator.Current.Segment.Count > 0 + && trgEnumerator.Current.Segment.Count > 0 + ) + ) + ) + { + yield return rangeInfo.CreateRow(); + } + + rangeInfo.TextId = srcEnumerator.Current.TextId; + rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); + rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); + sourceSameRefRows.Clear(); + targetSameRefRows.Clear(); + if (rangeInfo.IsSourceEmpty) + rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; + if (rangeInfo.IsTargetEmpty) + rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; + rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); + rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); + } + else + { + if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) + { + foreach (TextRow prevSourceRow in sourceSameRefRows) + { + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + prevSourceRow, + trgEnumerator.Current + ) + ) + { + yield return row; + } + } + } + + if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) + { + foreach (TextRow prevTargetRow in targetSameRefRows) + { + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + srcEnumerator.Current, + prevTargetRow + ) + ) + { + yield return row; + } + } + } + + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + srcEnumerator.Current, + trgEnumerator.Current, + compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null + ) + ) + { + yield return row; + } + } + + sourceSameRefRows.Add(srcEnumerator.Current); + srcCompleted = !srcEnumerator.MoveNext(); + + targetSameRefRows.Add(trgEnumerator.Current); + trgCompleted = !trgEnumerator.MoveNext(); + } + if (compare < 0) + { + if (!AllTargetRows && srcEnumerator.Current.IsInRange) + { + if ( + rangeInfo.IsInRange + && trgEnumerator.Current.IsInRange + && trgEnumerator.Current.Segment.Count > 0 + ) + { + yield return rangeInfo.CreateRow(); + } + rangeInfo.TextId = srcEnumerator.Current.TextId; + rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); + targetSameRefRows.Clear(); + if (rangeInfo.IsSourceEmpty) + rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; + rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); + } + else + { + foreach ( + ParallelTextRow row in CreateSourceRows( + rangeInfo, + srcEnumerator.Current, + targetSameRefRows, + forceTargetInRange: srcEnumerator.Current.TextId == trgEnumerator.Current.TextId + && !trgEnumerator.Current.IsRangeStart + && trgEnumerator.Current.IsInRange + ) + ) + { + yield return row; + } + } + + sourceSameRefRows.Add(srcEnumerator.Current); + srcCompleted = !srcEnumerator.MoveNext(); + } + else if (compare > 0) + { + if (!AllSourceRows && trgEnumerator.Current.IsInRange) + { + if ( + rangeInfo.IsInRange + && srcEnumerator.Current.IsInRange + && srcEnumerator.Current.Segment.Count > 0 + ) + { + yield return rangeInfo.CreateRow(); + } + rangeInfo.TextId = trgEnumerator.Current.TextId; + rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); + sourceSameRefRows.Clear(); + if (rangeInfo.IsTargetEmpty) + rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; + rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); + } + else + { + foreach ( + ParallelTextRow row in CreateTargetRows( + rangeInfo, + trgEnumerator.Current, + sourceSameRefRows, + forceSourceInRange: trgEnumerator.Current.TextId == srcEnumerator.Current.TextId + && !srcEnumerator.Current.IsRangeStart + && srcEnumerator.Current.IsInRange + ) + ) + { + yield return row; + } + } + + targetSameRefRows.Add(trgEnumerator.Current); + trgCompleted = !trgEnumerator.MoveNext(); + } + else + // compare == 0 - the refs are the same + { + if ( + (!AllTargetRows && srcEnumerator.Current.IsInRange) + || (!AllSourceRows && trgEnumerator.Current.IsInRange) + ) + { + if ( + rangeInfo.IsInRange + && ( + ( + srcEnumerator.Current.IsInRange + && !trgEnumerator.Current.IsInRange + && srcEnumerator.Current.Segment.Count > 0 + ) + || ( + !srcEnumerator.Current.IsInRange + && trgEnumerator.Current.IsInRange + && trgEnumerator.Current.Segment.Count > 0 + ) + || ( + srcEnumerator.Current.IsInRange + && trgEnumerator.Current.IsInRange + && srcEnumerator.Current.Segment.Count > 0 + && trgEnumerator.Current.Segment.Count > 0 + ) + ) + ) + { + yield return rangeInfo.CreateRow(); + } + + rangeInfo.TextId = srcEnumerator.Current.TextId; + rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); + rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); + sourceSameRefRows.Clear(); + targetSameRefRows.Clear(); + if (rangeInfo.IsSourceEmpty) + rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; + if (rangeInfo.IsTargetEmpty) + rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; + rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); + rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); + } + else + { + if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) + { + foreach (TextRow prevSourceRow in sourceSameRefRows) + { + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + prevSourceRow, + trgEnumerator.Current + ) + ) + { + yield return row; + } + } + } + + if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) + { + foreach (TextRow prevTargetRow in targetSameRefRows) + { + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + srcEnumerator.Current, + prevTargetRow + ) + ) + { + yield return row; + } + } + } + + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + srcEnumerator.Current, + trgEnumerator.Current, + compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null + ) + ) + { + yield return row; + } + } + + sourceSameRefRows.Add(srcEnumerator.Current); + srcCompleted = !srcEnumerator.MoveNext(); + + targetSameRefRows.Add(trgEnumerator.Current); + trgCompleted = !trgEnumerator.MoveNext(); + } + } + + while (!srcCompleted) + { + if (!AllTargetRows && srcEnumerator.Current.IsInRange) + { + rangeInfo.TextId = srcEnumerator.Current.TextId; + rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); + targetSameRefRows.Clear(); + if (rangeInfo.IsSourceEmpty) + rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; + rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); + } + else + { + foreach ( + ParallelTextRow row in CreateSourceRows(rangeInfo, srcEnumerator.Current, targetSameRefRows) + ) + { + yield return row; + } + } + srcCompleted = !srcEnumerator.MoveNext(); + } + + while (!trgCompleted) + { + if (!AllSourceRows && trgEnumerator.Current.IsInRange) + { + rangeInfo.TextId = trgEnumerator.Current.TextId; + rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); + sourceSameRefRows.Clear(); + if (rangeInfo.IsTargetEmpty) + rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; + rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); + } + else + { + foreach ( + ParallelTextRow row in CreateTargetRows(rangeInfo, trgEnumerator.Current, sourceSameRefRows) + ) + { + yield return row; + } + } + trgCompleted = !trgEnumerator.MoveNext(); + } + + if (rangeInfo.IsInRange) + yield return rangeInfo.CreateRow(); + } + } + + private IEnumerable CreateRows( + RangeInfo rangeInfo, + TextRow srcRow, + TextRow trgRow, + IReadOnlyCollection alignedWordPairs = null, + bool forceSourceInRange = false, + bool forceTargetInRange = false + ) + { + if (rangeInfo.IsInRange) + yield return rangeInfo.CreateRow(); + + string textId; + if (srcRow != null) + textId = srcRow.TextId; + else if (trgRow != null) + textId = trgRow.TextId; + else + throw new ArgumentNullException("Either a source or target must be specified."); + + object[] sourceRefs = srcRow != null ? new object[] { srcRow.Ref } : Array.Empty(); + object[] targetRefs = trgRow != null ? new object[] { trgRow.Ref } : Array.Empty(); + if (targetRefs.Length == 0 && TargetCorpus.IsScripture()) + { + targetRefs = sourceRefs + .Cast() + .Select(r => r.ChangeVersification(TargetCorpus.Versification)) + .Cast() + .ToArray(); + } + + TextRowFlags sourceFlags; + if (srcRow == null) + sourceFlags = forceSourceInRange ? TextRowFlags.InRange : TextRowFlags.None; + else + sourceFlags = srcRow.Flags; + + TextRowFlags targetFlags; + if (trgRow == null) + targetFlags = forceTargetInRange ? TextRowFlags.InRange : TextRowFlags.None; + else + targetFlags = trgRow.Flags; + + yield return new ParallelTextRow(textId, sourceRefs, targetRefs) + { + SourceSegment = srcRow != null ? srcRow.Segment : Array.Empty(), + TargetSegment = trgRow != null ? trgRow.Segment : Array.Empty(), + AlignedWordPairs = alignedWordPairs, + SourceFlags = sourceFlags, + TargetFlags = targetFlags + }; + } + + private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) + { + try + { + if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) + sameRefRows.Clear(); + } + catch (ArgumentException) + { + throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); + } + return sameRefRows.Count > 0; + } + + private IEnumerable CreateSourceRows( + RangeInfo rangeInfo, + TextRow sourceRow, + List targetSameRefRows, + bool forceTargetInRange = false + ) + { + if (CheckSameRefRows(targetSameRefRows, sourceRow)) + { + foreach (TextRow targetSameRefRow in targetSameRefRows) + { + foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceRow, targetSameRefRow)) + yield return row; + } + } + else if (AllSourceRows) + { + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + sourceRow, + null, + forceTargetInRange: forceTargetInRange + ) + ) + { + yield return row; + } + } + } + + private IEnumerable CreateTargetRows( + RangeInfo rangeInfo, + TextRow targetRow, + List sourceSameRefRows, + bool forceSourceInRange = false + ) + { + if (CheckSameRefRows(sourceSameRefRows, targetRow)) + { + foreach (TextRow sourceSameRefRow in sourceSameRefRows) + { + foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceSameRefRow, targetRow)) + yield return row; + } + } + else if (AllTargetRows) + { + foreach ( + ParallelTextRow row in CreateRows( + rangeInfo, + null, + targetRow, + forceSourceInRange: forceSourceInRange + ) + ) + { + yield return row; + } + } + } + + private class RangeRow + { + public List Refs { get; } = new List(); + public List Segment { get; } = new List(); + public bool IsSentenceStart { get; set; } = false; + public bool IsInRange => Refs.Count > 0; + public bool IsEmpty => Segment.Count == 0; + } + + private class NRangeInfo + { + public int N = -1; + public string TextId { get; set; } = ""; + public ScrVers Versification { get; set; } = null; + public List Rows { get; } = new List(); + public bool IsInRange => Rows.Any(r => r.IsInRange); + + public NParallelTextRow CreateRow() + { + object[] refs = new object[0]; + foreach (RangeRow cRow in Rows) + { + if (refs.Count() == 0 && Versification != null) + { + refs = cRow + .Refs.ToArray() + .Cast() + .Select(r => r.ChangeVersification(Versification)) + .Cast() + .ToArray(); + } + } + var nParRow = new NParallelTextRow(TextId, Rows.Select(r => r.Refs).ToArray()) + { + Segments = Rows.Select(r => r.Segment.ToArray()).ToArray(), + Flags = Rows.Select(r => r.IsSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None) + .ToArray() + }; + TextId = ""; + foreach (RangeRow r in Rows) + { + r.Refs.Clear(); + r.Segment.Clear(); + r.IsSentenceStart = false; + } + return nParRow; + } + } + + private class DefaultRowRefComparer : IComparer + { + public int Compare(object x, object y) + { + // Do not use the default comparer for ScriptureRef, since we want to ignore segments + if (x is ScriptureRef sx && y is ScriptureRef sy) + return sx.CompareTo(sy, compareSegments: false); + + return Comparer.Default.Compare(x, y); + } + } + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs b/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs new file mode 100644 index 000000000..5487b2001 --- /dev/null +++ b/src/SIL.Machine/Corpora/NParallelTextCorpusBase.cs @@ -0,0 +1,36 @@ +using System.Collections; +using System.Collections.Generic; +using System.Linq; + +namespace SIL.Machine.Corpora +{ + public abstract class NParallelTextCorpusBase : INParallelTextCorpus + { + int ICorpus.Count(bool includeEmpty) + { + return Count(includeEmpty, null); + } + + public virtual int Count(bool includeEmpty = true, IEnumerable textIds = null) + { + return includeEmpty ? GetRows(textIds).Count() : GetRows(textIds).Count(r => !r.IsEmpty); + } + + public IEnumerable GetRows() + { + return GetRows(null); + } + + public abstract IEnumerable GetRows(IEnumerable textIds); + + public IEnumerator GetEnumerator() + { + return GetRows().GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs new file mode 100644 index 000000000..3035be330 --- /dev/null +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -0,0 +1,54 @@ +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using SIL.Extensions; + +namespace SIL.Machine.Corpora +{ + public class NParallelTextRow : IRow + { + public NParallelTextRow(string textId, IEnumerable> nRefs) + { + if (string.IsNullOrEmpty(textId)) + throw new ArgumentNullException(nameof(textId)); + + if (nRefs.SelectMany(r => r).Count() == 0) + throw new ArgumentNullException("Either a source or target ref must be provided."); + + TextId = textId; + NRefs = nRefs.ToList().ToReadOnlyList(); + N = NRefs.Count; + Segments = Enumerable.Range(0, N).Select(_ => Array.Empty()).ToImmutableArray(); + Flags = Enumerable.Range(0, N).Select(_ => TextRowFlags.SentenceStart).ToImmutableArray(); + } + + public string TextId { get; } + + public object Ref => NRefs.SelectMany(r => r).First(); + + public IReadOnlyList> NRefs { get; } + public int N { get; } + + public IReadOnlyList> Segments { get; set; } + public IReadOnlyList Flags { get; set; } + + public bool GetIsSentenceStart(int i) => + Flags.Count > i ? Flags[i].HasFlag(TextRowFlags.SentenceStart) : throw new ArgumentOutOfRangeException(); + + public bool GetIsInRange(int i) => + Flags.Count > i ? Flags[i].HasFlag(TextRowFlags.InRange) : throw new ArgumentOutOfRangeException(); + + public bool GetIsRangeStart(int i) => + Flags.Count > i ? Flags[i].HasFlag(TextRowFlags.RangeStart) : throw new ArgumentOutOfRangeException(); + + public bool IsEmpty => Segments.Any(s => s.Count == 0); + + public string GetText(int i) => string.Join(" ", Segments[i]); + + public NParallelTextRow Invert() + { + return new NParallelTextRow(TextId, NRefs.Reverse()) { Flags = Flags.Reverse().ToImmutableArray(), }; + } + } +} diff --git a/src/SIL.Machine/Corpora/ParallelCorpusEnumerator.cs b/src/SIL.Machine/Corpora/ParallelCorpusEnumerator.cs new file mode 100644 index 000000000..eed65e5a1 --- /dev/null +++ b/src/SIL.Machine/Corpora/ParallelCorpusEnumerator.cs @@ -0,0 +1,126 @@ +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using SIL.ObjectModel; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public class ParallelCorpusEnumerator : DisposableBase, IEnumerator + { + private readonly IEnumerator _enumerator; + private readonly bool _isScripture = false; + private readonly Queue _verseRows; + private readonly ScrVers _refVersification; + private TextRow _current; + private bool _isEnumerating = false; + private bool _enumeratorHasMoreData = true; + + public ParallelCorpusEnumerator( + IEnumerator enumerator, + ScrVers refVersification, + ScrVers versification + ) + { + _enumerator = enumerator; + _refVersification = refVersification; + _isScripture = refVersification != null && versification != null && refVersification != versification; + _verseRows = new Queue(); + } + + public TextRow Current => _current; + + object IEnumerator.Current => Current; + + public bool MoveNext() + { + if (_isScripture) + { + if (!_isEnumerating) + { + _enumerator.MoveNext(); + _isEnumerating = true; + } + if (_verseRows.Count == 0 && _enumerator.Current != null && _enumeratorHasMoreData) + CollectVerses(); + if (_verseRows.Count > 0) + { + _current = _verseRows.Dequeue(); + return true; + } + _current = null; + return false; + } + + _enumeratorHasMoreData = _enumerator.MoveNext(); + _current = _enumerator.Current; + return _enumeratorHasMoreData; + } + + public void Reset() + { + _enumerator.Reset(); + _isEnumerating = false; + _enumeratorHasMoreData = true; + } + + protected override void DisposeManagedResources() + { + _enumerator.Dispose(); + } + + private void CollectVerses() + { + var rowList = new List<(ScriptureRef Ref, TextRow Row)>(); + bool outOfOrder = false; + ScriptureRef prevScrRef = ScriptureRef.Empty; + int rangeStartOffset = -1; + do + { + TextRow row = _enumerator.Current; + var scrRef = (ScriptureRef)row.Ref; + if (!prevScrRef.IsEmpty && scrRef.BookNum != prevScrRef.BookNum) + break; + + scrRef = scrRef.ChangeVersification(_refVersification); + // convert one-to-many versification mapping to a verse range + if (scrRef.Equals(prevScrRef)) + { + (ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[ + rowList.Count + rangeStartOffset + ]; + TextRowFlags flags = TextRowFlags.InRange; + if (rangeStartRow.IsSentenceStart) + flags |= TextRowFlags.SentenceStart; + if (rangeStartOffset == -1 && (!rangeStartRow.IsInRange || rangeStartRow.IsRangeStart)) + flags |= TextRowFlags.RangeStart; + rowList[rowList.Count + rangeStartOffset] = ( + rangeStartVerseRef, + new TextRow(rangeStartRow.TextId, rangeStartRow.Ref) + { + Segment = rangeStartRow.Segment.Concat(row.Segment).ToArray(), + Flags = flags + } + ); + row = new TextRow(row.TextId, row.Ref) { Flags = TextRowFlags.InRange }; + rangeStartOffset--; + } + else + { + rangeStartOffset = -1; + } + rowList.Add((scrRef, row)); + if (!outOfOrder && scrRef.CompareTo(prevScrRef) < 0) + outOfOrder = true; + prevScrRef = scrRef; + _enumeratorHasMoreData = _enumerator.MoveNext(); + } while (_enumeratorHasMoreData); + + if (outOfOrder) + rowList.Sort((x, y) => x.Ref.CompareTo(y.Ref)); + + foreach ((ScriptureRef _, TextRow row) in rowList) + _verseRows.Enqueue(row); + } + } +}