From 54ae315cf94f22a100315021fd652da64d52a533 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 13 Nov 2024 18:15:36 -0500 Subject: [PATCH] Reviewer changes --- .../Corpora/NParallelTextCorpus.cs | 300 +++++++++--------- src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 2 +- 2 files changed, 149 insertions(+), 153 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index f5dfd119..bc5210c4 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -110,198 +110,194 @@ private IEnumerable GetRows( IEnumerator alignmentEnumerator ) { + var rangeInfo = new NRangeInfo(N) { - var rangeInfo = new NRangeInfo(N) - { - Versifications = Corpora.Select(c => c.Versification).ToArray(), - RowRefComparer = RowRefComparer - }; + Versifications = Corpora.Select(c => c.Versification).ToArray(), + RowRefComparer = RowRefComparer + }; - bool[] completed = enumerators.Select(e => !e.MoveNext()).ToArray(); + bool[] completed = enumerators.Select(e => !e.MoveNext()).ToArray(); - while (!completed.All(c => c)) + while (!completed.All(c => c)) + { + List minRefIndexes; + List currentRows = enumerators.Select(e => e.Current).ToList(); + try + { + minRefIndexes = MinRefIndexes( + currentRows + .Select( + (e, i) => + { + if (!completed[i]) + return e.Ref; + return null; + } + ) + .ToArray() + ) + .ToList(); + } + catch (ArgumentException) { - List minRefIndexes; - List currentRows = enumerators.Select(e => e.Current).ToList(); - try + throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); + } + TextRow[] currentIncompleteRows = currentRows.Where((r, i) => !completed[i]).ToArray(); + List nonMinRefIndexes = Enumerable.Range(0, N).Except(minRefIndexes).ToList(); + int numberOfRemainingRows = N - completed.Count(c => c); + if (minRefIndexes.Count < numberOfRemainingRows || minRefIndexes.Count(i => !completed[i]) == 1) + //then there are some non-min refs or only one incomplete enumerator + { + List> minEnumerators = minRefIndexes.Select(i => enumerators[i]).ToList(); + List> nonMinEnumerators = nonMinRefIndexes + .Select(i => enumerators[i]) + .ToList(); + + if ( + nonMinRefIndexes.Any(i => !AllRows[i]) + && minRefIndexes.Any(i => !completed[i] && currentRows[i].IsInRange) + ) { - minRefIndexes = MinRefIndexes( - currentRows - .Select( - (e, i) => - { - if (!completed[i]) - return e.Ref; - return null; - } + if ( + rangeInfo.IsInRange + && nonMinEnumerators.Any(e => + e.Current != null && e.Current.IsInRange && e.Current.Segment.Count > 0 + ) + ) + { + yield return rangeInfo.CreateRow(); + } + minRefIndexes.ForEach(i => rangeInfo.AddTextRow(enumerators[i].Current, i)); + nonMinRefIndexes.ForEach(i => rangeInfo.Rows[i].SameRefRows.Clear()); + } + else + { + bool anyNonMinEnumeratorsMidRange = nonMinRefIndexes.Any(i => + !completed[i] && !currentRows[i].IsRangeStart && currentRows[i].IsInRange + ); + foreach ( + NParallelTextRow row in CreateMinRefRows( + rangeInfo, + currentRows.ToArray(), + minRefIndexes.ToArray(), + nonMinRefIndexes.ToArray(), + forceInRange: minRefIndexes + .Select(i => + anyNonMinEnumeratorsMidRange + && nonMinRefIndexes.All(j => + !completed[j] && currentRows[j].TextId == currentRows[i].TextId + ) ) - .ToArray() + .ToList() ) - .ToList(); + ) + { + yield return row; + } } - catch (ArgumentException) + foreach (int i in minRefIndexes) { - throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); + rangeInfo.Rows[i].SameRefRows.Add(enumerators[i].Current); + completed[i] = !enumerators[i].MoveNext(); } - TextRow[] currentIncompleteRows = currentRows.Where((r, i) => !completed[i]).ToArray(); - List nonMinRefIndexes = Enumerable.Range(0, N).Except(minRefIndexes).ToList(); - int numberOfRemainingRows = N - completed.Count(c => c); - if (minRefIndexes.Count < numberOfRemainingRows || minRefIndexes.Count(i => !completed[i]) == 1) - //then there are some non-min refs or only one incomplete enumerator + } + else if (minRefIndexes.Count == numberOfRemainingRows) + // the refs are all the same + { + int compareAlignmentCorpus = -1; + if (AlignmentCorpus != null) { - List> minEnumerators = minRefIndexes.Select(i => enumerators[i]).ToList(); - List> nonMinEnumerators = nonMinRefIndexes - .Select(i => enumerators[i]) - .ToList(); - - if ( - nonMinRefIndexes.Any(i => !AllRows[i]) - && minRefIndexes.Any(i => !completed[i] && currentRows[i].IsInRange) - ) + do { - if ( - rangeInfo.IsInRange - && nonMinEnumerators.Any(e => - e.Current != null && e.Current.IsInRange && e.Current.Segment.Count > 0 - ) - ) + try { - yield return rangeInfo.CreateRow(); + compareAlignmentCorpus = alignmentEnumerator.MoveNext() + ? RowRefComparer.Compare( + currentIncompleteRows[0].Ref, + alignmentEnumerator.Current.Ref + ) + : 1; } - minRefIndexes.ForEach(i => rangeInfo.AddTextRow(enumerators[i].Current, i)); - nonMinRefIndexes.ForEach(i => rangeInfo.Rows[i].SameRefRows.Clear()); - } - else - { - bool anyNonMinEnumeratorsMidRange = nonMinRefIndexes.Any(i => - !completed[i] && !currentRows[i].IsRangeStart && currentRows[i].IsInRange - ); - foreach ( - NParallelTextRow row in CreateMinRefRows( - rangeInfo, - currentRows.ToArray(), - minRefIndexes.ToArray(), - nonMinRefIndexes.ToArray(), - forceInRange: minRefIndexes - .Select(i => - anyNonMinEnumeratorsMidRange - && nonMinRefIndexes.All(j => - !completed[j] && currentRows[j].TextId == currentRows[i].TextId - ) - ) - .ToList() - ) - ) + catch (ArgumentException) { - yield return row; + throw new CorpusAlignmentException(currentRows.Select(e => e.Ref.ToString()).ToArray()); } - } - foreach (int i in minRefIndexes) - { - rangeInfo.Rows[i].SameRefRows.Add(enumerators[i].Current); - completed[i] = !enumerators[i].MoveNext(); - } + } while (compareAlignmentCorpus < 0); } - else if (minRefIndexes.Count == numberOfRemainingRows) - // the refs are all the same + + if ( + minRefIndexes + .Select(i => + enumerators[i].Current.IsInRange && minRefIndexes.All(j => j == i || !AllRows[j]) + ) + .Any(b => b) + ) { - int compareAlignmentCorpus = -1; - if (AlignmentCorpus != null) + if (rangeInfo.IsInRange && AllInRangeHaveSegments(currentIncompleteRows)) { - do - { - try - { - compareAlignmentCorpus = alignmentEnumerator.MoveNext() - ? RowRefComparer.Compare( - currentIncompleteRows[0].Ref, - alignmentEnumerator.Current.Ref - ) - : 1; - } - catch (ArgumentException) - { - throw new CorpusAlignmentException( - currentRows.Select(e => e.Ref.ToString()).ToArray() - ); - } - } while (compareAlignmentCorpus < 0); + yield return rangeInfo.CreateRow(); } - if ( - minRefIndexes - .Select(i => - enumerators[i].Current.IsInRange && minRefIndexes.All(j => j == i || !AllRows[j]) - ) - .Any(b => b) - ) + for (int i = 0; i < rangeInfo.Rows.Count; i++) { - if (rangeInfo.IsInRange && AllInRangeHaveSegments(currentIncompleteRows)) - { - yield return rangeInfo.CreateRow(); - } - - for (int i = 0; i < rangeInfo.Rows.Count; i++) - { - rangeInfo.AddTextRow(currentRows[i], i); - rangeInfo.Rows[i].SameRefRows.Clear(); - } + rangeInfo.AddTextRow(currentRows[i], i); + rangeInfo.Rows[i].SameRefRows.Clear(); } - else + } + else + { + for (int i = 0; i < rangeInfo.Rows.Count; i++) { - for (int i = 0; i < rangeInfo.Rows.Count; i++) + for (int j = 0; j < rangeInfo.Rows.Count; j++) //TODO rework { - for (int j = 0; j < rangeInfo.Rows.Count; j++) //TODO rework - { - if (i == j || completed[i] || completed[j]) - continue; + if (i == j || completed[i] || completed[j]) + continue; - if (rangeInfo.CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) + if (rangeInfo.CheckSameRefRows(rangeInfo.Rows[i].SameRefRows, currentRows[j])) + { + foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows) { - foreach (TextRow tr in rangeInfo.Rows[i].SameRefRows) + var textRows = new TextRow[N]; + textRows[i] = tr; + textRows[j] = currentRows[j]; + foreach (NParallelTextRow r in CreateRows(rangeInfo, textRows)) { - var textRows = new TextRow[N]; - textRows[i] = tr; - textRows[j] = currentRows[j]; - foreach (NParallelTextRow r in CreateRows(rangeInfo, textRows)) - { - yield return r; - } + yield return r; } } } } - foreach ( - NParallelTextRow row in CreateRows( - rangeInfo, - currentRows.Select((r, i) => completed[i] ? null : r).ToArray(), - alignedWordPairs: AlignmentCorpus != null && compareAlignmentCorpus == 0 - ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() - : null - ) - ) - { - yield return row; - } } - - for (int i = 0; i < rangeInfo.Rows.Count; i++) + foreach ( + NParallelTextRow row in CreateRows( + rangeInfo, + currentRows.Select((r, i) => completed[i] ? null : r).ToArray(), + alignedWordPairs: AlignmentCorpus != null && compareAlignmentCorpus == 0 + ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() + : null + ) + ) { - rangeInfo.Rows[i].SameRefRows.Add(currentRows[i]); - completed[i] = !enumerators[i].MoveNext(); + yield return row; } } - else + + for (int i = 0; i < rangeInfo.Rows.Count; i++) { - throw new CorpusAlignmentException( - minRefIndexes.Select(i => currentRows[i].Ref.ToString()).ToArray() - ); + rangeInfo.Rows[i].SameRefRows.Add(currentRows[i]); + completed[i] = !enumerators[i].MoveNext(); } } - - if (rangeInfo.IsInRange) - yield return rangeInfo.CreateRow(); + else + { + throw new CorpusAlignmentException( + minRefIndexes.Select(i => currentRows[i].Ref.ToString()).ToArray() + ); + } } + + if (rangeInfo.IsInRange) + yield return rangeInfo.CreateRow(); } private object[] CorrectVersification(object[] refs, int i) diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index e1b64281..f21acc7c 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -32,7 +32,7 @@ public ParallelTextCorpus( public IAlignmentCorpus AlignmentCorpus { get; } public IComparer RowRefComparer { get; } - private NParallelTextCorpus NParallelTextCorpus { get; set; } + public NParallelTextCorpus NParallelTextCorpus { get; } public override IEnumerable GetRows(IEnumerable textIds) {