From bd0ec45a397657f690c8ab2e4037c1fbb572c006 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Fri, 8 Nov 2024 16:38:47 -0500 Subject: [PATCH] More fixes --- src/SIL.Machine/Corpora/CorporaExtensions.cs | 49 +- .../Corpora/NParallelTextCorpus.cs | 30 +- src/SIL.Machine/Corpora/NParallelTextRow.cs | 4 +- src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 661 +----------------- .../Corpora/CorporaExtensionsTests.cs | 86 +++ 5 files changed, 154 insertions(+), 676 deletions(-) diff --git a/src/SIL.Machine/Corpora/CorporaExtensions.cs b/src/SIL.Machine/Corpora/CorporaExtensions.cs index 0c6c4228..a737d8c1 100644 --- a/src/SIL.Machine/Corpora/CorporaExtensions.cs +++ b/src/SIL.Machine/Corpora/CorporaExtensions.cs @@ -245,6 +245,16 @@ public static IParallelTextCorpus AlignRows( }; } + public static NParallelTextCorpus AlignMany(this ITextCorpus[] corpora, bool[] allRowsPerCorpus = null) + { + NParallelTextCorpus nParallelTextCorpus = new NParallelTextCorpus(corpora); + if (allRowsPerCorpus != null) + { + nParallelTextCorpus.AllRowsList = allRowsPerCorpus; + } + return nParallelTextCorpus; + } + public static (ITextCorpus, ITextCorpus, int, int) Split( this ITextCorpus corpus, double? percent = null, @@ -564,35 +574,46 @@ public MergedCorpus(NParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule public override IEnumerable GetRows(IEnumerable textIds) { + int indexOfInRangeRow = -1; foreach (NParallelTextRow nRow in _corpus.GetRows(textIds)) { - if (nRow.N == 0 || nRow.IsEmpty) - continue; IReadOnlyList nonEmptyIndices = nRow .NSegments.Select((s, i) => (s, i)) - .Where(pair => pair.s.Count > 0) + .Where(pair => pair.s.Count > 0 || nRow.GetIsInRange(pair.i)) .Select(pair => pair.i) .ToList(); IReadOnlyList indices = nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList(); + if (indexOfInRangeRow == -1) + { + indices = indices.Where(i => nRow.GetIsRangeStart(i) || !nRow.GetIsInRange(i)).ToList(); + } + if (indices.Count == 0) + continue; + int indexOfSelectedRow = -1; switch (_mergeRule) { case MergeRule.First: - yield return new TextRow(nRow.TextId, nRow.NRefs[indices.First()]) - { - Segment = nRow.NSegments[indices.First()], - Flags = nRow.NFlags[indices.First()] - }; + indexOfSelectedRow = indices.First(); break; case MergeRule.Random: - int i = _random.Next(0, indices.Count); - yield return new TextRow(nRow.TextId, nRow.NRefs[i]) - { - Segment = nRow.NSegments[i], - Flags = nRow.NFlags[i] - }; + indexOfSelectedRow = indices[_random.Next(0, indices.Count)]; break; } + indexOfSelectedRow = indexOfInRangeRow != -1 ? indexOfInRangeRow : indexOfSelectedRow; + if (!nRow.GetIsInRange(indexOfSelectedRow)) + { + indexOfInRangeRow = -1; + } + if (nRow.GetIsRangeStart(indexOfSelectedRow)) + { + indexOfInRangeRow = indexOfSelectedRow; + } + yield return new TextRow(nRow.TextId, nRow.Ref) + { + Segment = nRow.NSegments[indexOfSelectedRow], + Flags = nRow.NFlags[indexOfSelectedRow] + }; } } } diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index ad5fc73a..b9da9797 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -155,7 +155,10 @@ IEnumerator alignmentEnumerator var currentIncompleteRows = currentRows.Where((r, i) => !completed[i]).ToArray(); IList nonMinRefIndexes = System.Linq.Enumerable.Range(0, N).Except(minRefIndexes).ToList(); - if (minRefIndexes.Count < (N - completed.Count(c => c)) || completed.Where(c => !c).Count() == 1) //then there are some non-min refs or only one incomplete enumerator + if ( + minRefIndexes.Count < (N - completed.Count(c => c)) + || completed.Where((c, i) => !c && minRefIndexes.Contains(i)).Count() == 1 + ) //then there are some non-min refs or only one incomplete enumerator { IList> minEnumerators = minRefIndexes .Select(i => listOfEnumerators[i]) @@ -285,7 +288,7 @@ NParallelTextRow row in CreateMinRefRows( foreach ( NParallelTextRow row in CreateRows( rangeInfo, - currentIncompleteRows, + currentRows.Select((r, i) => completed[i] ? null : r).ToArray(), alignedWordPairs: AlignmentCorpus != null && compareAlignmentCorpus == 0 ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() : null @@ -338,29 +341,30 @@ private IEnumerable CreateRows( if (rows.All(r => r == null)) throw new ArgumentNullException("A corpus row must be specified."); - object[] refRefs = new object[] { rows.Select(r => r?.Ref).First() }; + object[] defaultRefs = new object[] { }; + if (rows.Any(r => r != null)) + defaultRefs = new object[] { rows.Where(r => r != null).Select(r => r.Ref).First() }; string textId = null; - IList refs = new List(); - IList flags = new List(); + object[][] refs = new object[N][]; + TextRowFlags[] flags = new TextRowFlags[N]; for (int i = 0; i < rows.Count; i++) { if (rows[i] != null) { textId = textId ?? rows[i]?.TextId; - refs.Add( - CorrectVersification(rows[i].Ref == null ? new object[] { } : new object[] { rows[i].Ref }, i) - ); - flags.Add(rows[i].Flags); + refs[i] = CorrectVersification(rows[i].Ref == null ? defaultRefs : new object[] { rows[i].Ref }, i); + flags[i] = rows[i].Flags; } else { if (Corpora[i].IsScripture()) - refs.Add(CorrectVersification(refRefs, i)); + refs[i] = CorrectVersification(defaultRefs, i); else - refs.Add(new object[] { }); - flags.Add(forceInRange != null && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None); + refs[i] = new object[] { }; + flags[i] = forceInRange != null && forceInRange[i] ? TextRowFlags.InRange : TextRowFlags.None; } } + refs = refs.Select(r => r ?? (new object[] { })).ToArray(); yield return new NParallelTextRow(textId, refs) { @@ -524,7 +528,7 @@ public NParallelTextRow CreateRow() } } - private class DefaultRowRefComparer : IComparer + public class DefaultRowRefComparer : IComparer { public int Compare(object x, object y) { diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs index cc04b52e..fd60d8d9 100644 --- a/src/SIL.Machine/Corpora/NParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -13,8 +13,8 @@ public NParallelTextRow(string textId, IEnumerable> nRefs) if (string.IsNullOrEmpty(textId)) throw new ArgumentNullException(nameof(textId)); - if (nRefs.SelectMany(r => r).Count() == 0) - throw new ArgumentNullException("Either a source or target ref must be provided."); + if (nRefs == null || nRefs.Where(r => r != null).SelectMany(r => r).Count() == 0) + throw new ArgumentNullException($"Refs must be provided but nRefs={nRefs}"); TextId = textId; NRefs = nRefs.ToList().ToReadOnlyList(); diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index e015dc47..8e79b545 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -1,9 +1,4 @@ -using System; -using System.Collections; -using System.Collections.Generic; -using System.Linq; -using SIL.ObjectModel; -using SIL.Scripture; +using System.Collections.Generic; namespace SIL.Machine.Corpora { @@ -19,7 +14,7 @@ public ParallelTextCorpus( SourceCorpus = sourceCorpus; TargetCorpus = targetCorpus; AlignmentCorpus = alignmentCorpus ?? new DictionaryAlignmentCorpus(); - RowRefComparer = rowRefComparer ?? new DefaultRowRefComparer(); + RowRefComparer = rowRefComparer ?? new NParallelTextCorpus.DefaultRowRefComparer(); NParallelTextCorpus = new NParallelTextCorpus(new List { SourceCorpus, TargetCorpus }) { AlignmentCorpus = AlignmentCorpus @@ -41,650 +36,22 @@ public ParallelTextCorpus( public override IEnumerable GetRows(IEnumerable textIds) { - if (2 > RowRefComparer.Compare(0, 0)) + NParallelTextCorpus.AllRowsList = new bool[] { AllSourceRows, AllTargetRows }; + bool isScripture = SourceCorpus.IsScripture() && TargetCorpus.IsScripture(); + foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) { - //TODO rework - just for testing - NParallelTextCorpus.AllRowsList = new bool[] { AllSourceRows, AllTargetRows }; - - foreach (var nRow in NParallelTextCorpus.GetRows(textIds)) - { - yield return new ParallelTextRow(nRow.TextId, nRow.NRefs[0], nRow.NRefs[1]) - { - SourceFlags = nRow.NFlags[0], - TargetFlags = nRow.NFlags[1], - SourceSegment = nRow.NSegments[0], - TargetSegment = nRow.NSegments[1], - AlignedWordPairs = nRow.AlignedWordPairs - }; - } - } - else - { - IEnumerable sourceTextIds = SourceCorpus.Texts.Select(t => t.Id); - IEnumerable targetTextIds = TargetCorpus.Texts.Select(t => t.Id); - - HashSet filterTextIds; - if (AllSourceRows && AllTargetRows) - { - filterTextIds = new HashSet(sourceTextIds); - filterTextIds.UnionWith(targetTextIds); - } - else if (!AllSourceRows && !AllTargetRows) - { - filterTextIds = new HashSet(sourceTextIds); - filterTextIds.IntersectWith(targetTextIds); - } - else if (AllSourceRows) - { - filterTextIds = new HashSet(sourceTextIds); - } - else - { - filterTextIds = new HashSet(targetTextIds); - } - - if (textIds != null) - filterTextIds.IntersectWith(textIds); - - using (IEnumerator srcEnumerator = SourceCorpus.GetRows(filterTextIds).GetEnumerator()) - using ( - var trgEnumerator = new TargetCorpusEnumerator( - TargetCorpus.GetRows(filterTextIds).GetEnumerator(), - SourceCorpus.Versification, - TargetCorpus.Versification - ) + yield return new ParallelTextRow( + nRow.TextId, + nRow.NRefs[0].Count > 0 || !isScripture ? nRow.NRefs[0] : new object[] { nRow.Ref }, + nRow.NRefs[1].Count > 0 || !isScripture ? nRow.NRefs[1] : new object[] { nRow.Ref } ) - using ( - IEnumerator alignmentEnumerator = AlignmentCorpus - .GetRows(filterTextIds) - .GetEnumerator() - ) - { - var rangeInfo = new RangeInfo { TargetVersification = TargetCorpus.Versification }; - var sourceSameRefRows = new List(); - var targetSameRefRows = new List(); - - bool srcCompleted = !srcEnumerator.MoveNext(); - bool trgCompleted = !trgEnumerator.MoveNext(); - while (!srcCompleted && !trgCompleted) - { - int compare1 = 0; - try - { - compare1 = RowRefComparer.Compare(srcEnumerator.Current.Ref, trgEnumerator.Current.Ref); - } - catch (ArgumentException) - { - throw new CorpusAlignmentException( - srcEnumerator.Current.Ref.ToString(), - trgEnumerator.Current.Ref.ToString() - ); - } - if (compare1 < 0) - { - // source is less than target - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateSourceRows( - rangeInfo, - srcEnumerator.Current, - targetSameRefRows, - forceTargetInRange: srcEnumerator.Current.TextId == trgEnumerator.Current.TextId - && !trgEnumerator.Current.IsRangeStart - && trgEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - } - else if (compare1 > 0) - { - if (!AllSourceRows && trgEnumerator.Current.IsInRange) - { - if ( - rangeInfo.IsInRange - && srcEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - { - yield return rangeInfo.CreateRow(); - } - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateTargetRows( - rangeInfo, - trgEnumerator.Current, - sourceSameRefRows, - forceSourceInRange: trgEnumerator.Current.TextId == srcEnumerator.Current.TextId - && !srcEnumerator.Current.IsRangeStart - && srcEnumerator.Current.IsInRange - ) - ) - { - yield return row; - } - } - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); - } - else - { - int compare2; - do - { - try - { - compare2 = alignmentEnumerator.MoveNext() - ? RowRefComparer.Compare( - srcEnumerator.Current.Ref, - alignmentEnumerator.Current.Ref - ) - : 1; - } - catch (ArgumentException) - { - throw new CorpusAlignmentException( - srcEnumerator.Current.Ref.ToString(), - trgEnumerator.Current.Ref.ToString() - ); - } - } while (compare2 < 0); - - if ( - (!AllTargetRows && srcEnumerator.Current.IsInRange) - || (!AllSourceRows && trgEnumerator.Current.IsInRange) - ) - { - if ( - rangeInfo.IsInRange - && ( - ( - srcEnumerator.Current.IsInRange - && !trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - ) - || ( - !srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && trgEnumerator.Current.Segment.Count > 0 - ) - || ( - srcEnumerator.Current.IsInRange - && trgEnumerator.Current.IsInRange - && srcEnumerator.Current.Segment.Count > 0 - && trgEnumerator.Current.Segment.Count > 0 - ) - ) - ) - { - yield return rangeInfo.CreateRow(); - } - - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - if (CheckSameRefRows(sourceSameRefRows, trgEnumerator.Current)) - { - foreach (TextRow prevSourceRow in sourceSameRefRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - prevSourceRow, - trgEnumerator.Current - ) - ) - { - yield return row; - } - } - } - - if (CheckSameRefRows(targetSameRefRows, srcEnumerator.Current)) - { - foreach (TextRow prevTargetRow in targetSameRefRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - prevTargetRow - ) - ) - { - yield return row; - } - } - } - - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - srcEnumerator.Current, - trgEnumerator.Current, - compare2 == 0 ? alignmentEnumerator.Current.AlignedWordPairs : null - ) - ) - { - yield return row; - } - } - - sourceSameRefRows.Add(srcEnumerator.Current); - srcCompleted = !srcEnumerator.MoveNext(); - - targetSameRefRows.Add(trgEnumerator.Current); - trgCompleted = !trgEnumerator.MoveNext(); - } - } - - while (!srcCompleted) - { - if (!AllTargetRows && srcEnumerator.Current.IsInRange) - { - rangeInfo.TextId = srcEnumerator.Current.TextId; - rangeInfo.SourceRefs.Add(srcEnumerator.Current.Ref); - targetSameRefRows.Clear(); - if (rangeInfo.IsSourceEmpty) - rangeInfo.IsSourceSentenceStart = srcEnumerator.Current.IsSentenceStart; - rangeInfo.SourceSegment.AddRange(srcEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateSourceRows( - rangeInfo, - srcEnumerator.Current, - targetSameRefRows - ) - ) - { - yield return row; - } - } - srcCompleted = !srcEnumerator.MoveNext(); - } - - while (!trgCompleted) - { - if (!AllSourceRows && trgEnumerator.Current.IsInRange) - { - rangeInfo.TextId = trgEnumerator.Current.TextId; - rangeInfo.TargetRefs.Add(trgEnumerator.Current.Ref); - sourceSameRefRows.Clear(); - if (rangeInfo.IsTargetEmpty) - rangeInfo.IsTargetSentenceStart = trgEnumerator.Current.IsSentenceStart; - rangeInfo.TargetSegment.AddRange(trgEnumerator.Current.Segment); - } - else - { - foreach ( - ParallelTextRow row in CreateTargetRows( - rangeInfo, - trgEnumerator.Current, - sourceSameRefRows - ) - ) - { - yield return row; - } - } - trgCompleted = !trgEnumerator.MoveNext(); - } - - if (rangeInfo.IsInRange) - yield return rangeInfo.CreateRow(); - } - } - } - - private IEnumerable CreateRows( - RangeInfo rangeInfo, - TextRow srcRow, - TextRow trgRow, - IReadOnlyCollection alignedWordPairs = null, - bool forceSourceInRange = false, - bool forceTargetInRange = false - ) - { - if (rangeInfo.IsInRange) - yield return rangeInfo.CreateRow(); - - string textId; - if (srcRow != null) - textId = srcRow.TextId; - else if (trgRow != null) - textId = trgRow.TextId; - else - throw new ArgumentNullException("Either a source or target must be specified."); - - object[] sourceRefs = srcRow != null ? new object[] { srcRow.Ref } : Array.Empty(); - object[] targetRefs = trgRow != null ? new object[] { trgRow.Ref } : Array.Empty(); - if (targetRefs.Length == 0 && TargetCorpus.IsScripture()) - { - targetRefs = sourceRefs - .Cast() - .Select(r => r.ChangeVersification(TargetCorpus.Versification)) - .Cast() - .ToArray(); - } - - TextRowFlags sourceFlags; - if (srcRow == null) - sourceFlags = forceSourceInRange ? TextRowFlags.InRange : TextRowFlags.None; - else - sourceFlags = srcRow.Flags; - - TextRowFlags targetFlags; - if (trgRow == null) - targetFlags = forceTargetInRange ? TextRowFlags.InRange : TextRowFlags.None; - else - targetFlags = trgRow.Flags; - - yield return new ParallelTextRow(textId, sourceRefs, targetRefs) - { - SourceSegment = srcRow != null ? srcRow.Segment : Array.Empty(), - TargetSegment = trgRow != null ? trgRow.Segment : Array.Empty(), - AlignedWordPairs = alignedWordPairs, - SourceFlags = sourceFlags, - TargetFlags = targetFlags - }; - } - - private bool CheckSameRefRows(List sameRefRows, TextRow otherRow) - { - try - { - if (sameRefRows.Count > 0 && RowRefComparer.Compare(sameRefRows[0].Ref, otherRow.Ref) != 0) - sameRefRows.Clear(); - } - catch (ArgumentException) - { - throw new CorpusAlignmentException(sameRefRows[0].Ref.ToString(), otherRow.Ref.ToString()); - } - return sameRefRows.Count > 0; - } - - private IEnumerable CreateSourceRows( - RangeInfo rangeInfo, - TextRow sourceRow, - List targetSameRefRows, - bool forceTargetInRange = false - ) - { - if (CheckSameRefRows(targetSameRefRows, sourceRow)) - { - foreach (TextRow targetSameRefRow in targetSameRefRows) { - foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceRow, targetSameRefRow)) - yield return row; - } - } - else if (AllSourceRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - sourceRow, - null, - forceTargetInRange: forceTargetInRange - ) - ) - { - yield return row; - } - } - } - - private IEnumerable CreateTargetRows( - RangeInfo rangeInfo, - TextRow targetRow, - List sourceSameRefRows, - bool forceSourceInRange = false - ) - { - if (CheckSameRefRows(sourceSameRefRows, targetRow)) - { - foreach (TextRow sourceSameRefRow in sourceSameRefRows) - { - foreach (ParallelTextRow row in CreateRows(rangeInfo, sourceSameRefRow, targetRow)) - yield return row; - } - } - else if (AllTargetRows) - { - foreach ( - ParallelTextRow row in CreateRows( - rangeInfo, - null, - targetRow, - forceSourceInRange: forceSourceInRange - ) - ) - { - yield return row; - } - } - } - - private class RangeInfo - { - public string TextId { get; set; } = ""; - public List SourceRefs { get; } = new List(); - public List TargetRefs { get; } = new List(); - public List SourceSegment { get; } = new List(); - public List TargetSegment { get; } = new List(); - public bool IsSourceSentenceStart { get; set; } = false; - public bool IsTargetSentenceStart { get; set; } = false; - public bool IsInRange => SourceRefs.Count > 0 || TargetRefs.Count > 0; - public bool IsSourceEmpty => SourceSegment.Count == 0; - public bool IsTargetEmpty => TargetSegment.Count == 0; - - public ScrVers TargetVersification { get; set; } = null; - - public ParallelTextRow CreateRow() - { - object[] trgRefs = TargetRefs.ToArray(); - if (TargetRefs.Count == 0 && TargetVersification != null) - { - trgRefs = SourceRefs - .ToArray() - .Cast() - .Select(r => r.ChangeVersification(TargetVersification)) - .Cast() - .ToArray(); - } - var row = new ParallelTextRow(TextId, SourceRefs.ToArray(), trgRefs) - { - SourceSegment = SourceSegment.ToArray(), - TargetSegment = TargetSegment.ToArray(), - SourceFlags = IsSourceSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None, - TargetFlags = IsTargetSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None + SourceFlags = nRow.NFlags[0], + TargetFlags = nRow.NFlags[1], + SourceSegment = nRow.NSegments[0], + TargetSegment = nRow.NSegments[1], + AlignedWordPairs = nRow.AlignedWordPairs }; - TextId = ""; - SourceRefs.Clear(); - TargetRefs.Clear(); - SourceSegment.Clear(); - TargetSegment.Clear(); - IsSourceSentenceStart = false; - IsTargetSentenceStart = false; - return row; - } - } - - private class DefaultRowRefComparer : IComparer - { - public int Compare(object x, object y) - { - // Do not use the default comparer for ScriptureRef, since we want to ignore segments - if (x is ScriptureRef sx && y is ScriptureRef sy) - return sx.CompareTo(sy, compareSegments: false); - - return Comparer.Default.Compare(x, y); - } - } - - private class TargetCorpusEnumerator : DisposableBase, IEnumerator - { - private readonly IEnumerator _enumerator; - private readonly bool _isScripture = false; - private readonly Queue _verseRows; - private readonly ScrVers _sourceVersification; - private TextRow _current; - private bool _isEnumerating = false; - private bool _enumeratorHasMoreData = true; - - public TargetCorpusEnumerator( - IEnumerator enumerator, - ScrVers sourceVersification, - ScrVers targetVersification - ) - { - _enumerator = enumerator; - _sourceVersification = sourceVersification; - _isScripture = - sourceVersification != null - && targetVersification != null - && sourceVersification != targetVersification; - _verseRows = new Queue(); - } - - public TextRow Current => _current; - - object IEnumerator.Current => Current; - - public bool MoveNext() - { - if (_isScripture) - { - if (!_isEnumerating) - { - _enumerator.MoveNext(); - _isEnumerating = true; - } - if (_verseRows.Count == 0 && _enumerator.Current != null && _enumeratorHasMoreData) - CollectVerses(); - if (_verseRows.Count > 0) - { - _current = _verseRows.Dequeue(); - return true; - } - _current = null; - return false; - } - - _enumeratorHasMoreData = _enumerator.MoveNext(); - _current = _enumerator.Current; - return _enumeratorHasMoreData; - } - - public void Reset() - { - _enumerator.Reset(); - _isEnumerating = false; - _enumeratorHasMoreData = true; - } - - protected override void DisposeManagedResources() - { - _enumerator.Dispose(); - } - - private void CollectVerses() - { - var rowList = new List<(ScriptureRef Ref, TextRow Row)>(); - bool outOfOrder = false; - ScriptureRef prevScrRef = ScriptureRef.Empty; - int rangeStartOffset = -1; - do - { - TextRow row = _enumerator.Current; - var scrRef = (ScriptureRef)row.Ref; - if (!prevScrRef.IsEmpty && scrRef.BookNum != prevScrRef.BookNum) - break; - - scrRef = scrRef.ChangeVersification(_sourceVersification); - // convert one-to-many versification mapping to a verse range - if (scrRef.Equals(prevScrRef)) - { - (ScriptureRef rangeStartVerseRef, TextRow rangeStartRow) = rowList[ - rowList.Count + rangeStartOffset - ]; - TextRowFlags flags = TextRowFlags.InRange; - if (rangeStartRow.IsSentenceStart) - flags |= TextRowFlags.SentenceStart; - if (rangeStartOffset == -1 && (!rangeStartRow.IsInRange || rangeStartRow.IsRangeStart)) - flags |= TextRowFlags.RangeStart; - rowList[rowList.Count + rangeStartOffset] = ( - rangeStartVerseRef, - new TextRow(rangeStartRow.TextId, rangeStartRow.Ref) - { - Segment = rangeStartRow.Segment.Concat(row.Segment).ToArray(), - Flags = flags - } - ); - row = new TextRow(row.TextId, row.Ref) { Flags = TextRowFlags.InRange }; - rangeStartOffset--; - } - else - { - rangeStartOffset = -1; - } - rowList.Add((scrRef, row)); - if (!outOfOrder && scrRef.CompareTo(prevScrRef) < 0) - outOfOrder = true; - prevScrRef = scrRef; - _enumeratorHasMoreData = _enumerator.MoveNext(); - } while (_enumeratorHasMoreData); - - if (outOfOrder) - rowList.Sort((x, y) => x.Ref.CompareTo(y.Ref)); - - foreach ((ScriptureRef _, TextRow row) in rowList) - _verseRows.Enqueue(row); } } } diff --git a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs index d813aff4..5f397bc3 100644 --- a/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/CorporaExtensionsTests.cs @@ -199,6 +199,92 @@ public void MergedCorpus_SelectRandom_Seed4501() }); } + [Test] + public void AlignMergedCorpora() + { + var sourceCorpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 1 segment 1 ."), + TextRow("text1", 2, "source 1 segment 2 ."), + TextRow("text1", 3, "source 1 segment 3 .") + } + ) + ); + var sourceCorpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 2 segment 1 ."), + TextRow("text1", 2, "source 2 segment 2 ."), + TextRow("text1", 3, "source 2 segment 3 .") + } + ) + ); + var sourceCorpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "source 3 segment 1 ."), + TextRow("text1", 2, "source 3 segment 2 ."), + TextRow("text1", 3, "source 3 segment 3 .") + } + ) + ); + + ITextCorpus sourceCorpus = (new ITextCorpus[] { sourceCorpus1, sourceCorpus1, sourceCorpus3 }) + .AlignMany([true, true, true]) + .SelectFirst(); + + var targetCorpus1 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "target 1 segment 1 ."), + TextRow("text1", 2, "target 1 segment 2 ."), + TextRow("text1", 3, "target 1 segment 3 .") + } + ) + ); + var targetCorpus2 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "target 2 segment 1 ."), + TextRow("text1", 2, "target 2 segment 2 ."), + TextRow("text1", 3, "target 2 segment 3 .") + } + ) + ); + var targetCorpus3 = new DictionaryTextCorpus( + new MemoryText( + "text1", + new[] + { + TextRow("text1", 1, "target 3 segment 1 ."), + TextRow("text1", 2, "target 3 segment 2 ."), + TextRow("text1", 3, "target 3 segment 3 .") + } + ) + ); + + ITextCorpus targetCorpus = (new ITextCorpus[] { targetCorpus1, targetCorpus2, targetCorpus3 }) + .AlignMany([true, true, true]) + .SelectFirst(); + + IParallelTextCorpus alignedCorpus = sourceCorpus.AlignRows(targetCorpus); + ParallelTextRow[] rows = alignedCorpus.GetRows().ToArray(); + Assert.That(rows, Has.Length.EqualTo(3)); + Assert.That(rows[0].SourceText, Is.EqualTo("source 1 segment 1 .")); + Assert.That(rows[2].TargetText, Is.EqualTo("target 1 segment 3 .")); + } + private static TextRow TextRow( string textId, object rowRef,