Skip to content

Commit

Permalink
Reviewer-requested changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Enkidu93 committed Nov 13, 2024
1 parent 7d47b9e commit e07cf64
Show file tree
Hide file tree
Showing 9 changed files with 160 additions and 171 deletions.
73 changes: 38 additions & 35 deletions src/SIL.Machine/Corpora/CorporaExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -245,16 +245,6 @@ public static IParallelTextCorpus AlignRows(
};
}

/// <summary>
/// Wraps the given corpora in a new <see cref="NParallelTextCorpus"/>.
/// </summary>
/// <param name="corpora">The corpora handed to the <see cref="NParallelTextCorpus"/> constructor.</param>
/// <param name="allRowsPerCorpus">
/// Optional flags assigned to <c>AllRowsList</c> on the new corpus. When <c>null</c>, the property keeps
/// whatever value the constructor gave it. Presumably one flag per corpus; confirm against
/// <see cref="NParallelTextCorpus"/>.
/// </param>
/// <returns>The newly constructed corpus.</returns>
public static NParallelTextCorpus AlignMany(this ITextCorpus[] corpora, bool[] allRowsPerCorpus = null)
{
    NParallelTextCorpus aligned = new NParallelTextCorpus(corpora);

    // Nothing to override: hand back the corpus exactly as constructed.
    if (allRowsPerCorpus == null)
        return aligned;

    aligned.AllRowsList = allRowsPerCorpus;
    return aligned;
}

public static (ITextCorpus, ITextCorpus, int, int) Split(
this ITextCorpus corpus,
double? percent = null,
Expand Down Expand Up @@ -377,16 +367,6 @@ public static ITextCorpus FilterTexts(this ITextCorpus corpus, IEnumerable<strin
return new FilterTextsTextCorpus(corpus, textIds);
}

/// <summary>
/// Collapses an n-way parallel corpus into a single <see cref="ITextCorpus"/> by wrapping it in a
/// <c>MergedCorpus</c> configured with <c>MergeRule.Random</c>.
/// </summary>
/// <param name="corpus">The n-way parallel corpus to merge.</param>
/// <param name="seed">Seed forwarded to the merged corpus for its random number generator.</param>
/// <returns>The merged corpus.</returns>
public static ITextCorpus ChooseRandom(this NParallelTextCorpus corpus, int seed) =>
    new MergedCorpus(corpus, MergeRule.Random, seed);

/// <summary>
/// Collapses an n-way parallel corpus into a single <see cref="ITextCorpus"/> by wrapping it in a
/// <c>MergedCorpus</c> configured with <c>MergeRule.First</c>.
/// </summary>
/// <param name="corpus">The n-way parallel corpus to merge.</param>
/// <returns>The merged corpus.</returns>
public static ITextCorpus ChooseFirst(this NParallelTextCorpus corpus) =>
    // The merged corpus still requires a seed argument; 0 is passed as a fixed value.
    new MergedCorpus(corpus, MergeRule.First, 0);

private class TransformTextCorpus : TextCorpusBase
{
private readonly ITextCorpus _corpus;
Expand Down Expand Up @@ -541,36 +521,59 @@ public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
}
}

#endregion

#region INParallelTextCorpus operations

/// <summary>
/// Wraps the given corpora in a new <see cref="NParallelTextCorpus"/> and returns it through the
/// <see cref="INParallelTextCorpus"/> interface.
/// </summary>
/// <param name="corpora">The corpora handed to the <see cref="NParallelTextCorpus"/> constructor.</param>
/// <param name="allRowsPerCorpus">
/// Optional flags that are copied into an array and assigned to <c>AllRows</c> on the new corpus. When
/// <c>null</c>, the property keeps whatever value the constructor gave it. Presumably one flag per
/// corpus; confirm against <see cref="NParallelTextCorpus"/>.
/// </param>
/// <returns>The newly constructed corpus.</returns>
public static INParallelTextCorpus AlignMany(
    this IEnumerable<ITextCorpus> corpora,
    IEnumerable<bool> allRowsPerCorpus = null
)
{
    NParallelTextCorpus aligned = new NParallelTextCorpus(corpora);

    // Nothing to override: hand back the corpus exactly as constructed.
    if (allRowsPerCorpus == null)
        return aligned;

    // Materialize once so the corpus holds its own snapshot of the flags.
    bool[] flags = allRowsPerCorpus.ToArray();
    aligned.AllRows = flags;
    return aligned;
}

/// <summary>
/// Collapses an n-way parallel corpus into a single <see cref="ITextCorpus"/> by wrapping it in a
/// <c>MergedCorpus</c> configured with <c>MergeRule.Random</c>.
/// </summary>
/// <param name="corpus">The n-way parallel corpus to merge.</param>
/// <param name="seed">Seed forwarded to the merged corpus for its random number generator.</param>
/// <returns>The merged corpus.</returns>
public static ITextCorpus ChooseRandom(this INParallelTextCorpus corpus, int seed) =>
    new MergedCorpus(corpus, MergeRule.Random, seed);

/// <summary>
/// Collapses an n-way parallel corpus into a single <see cref="ITextCorpus"/> by wrapping it in a
/// <c>MergedCorpus</c> configured with <c>MergeRule.First</c>.
/// </summary>
/// <param name="corpus">The n-way parallel corpus to merge.</param>
/// <returns>The merged corpus.</returns>
public static ITextCorpus ChooseFirst(this INParallelTextCorpus corpus) =>
    // The merged corpus still requires a seed argument; 0 is passed as a fixed value.
    new MergedCorpus(corpus, MergeRule.First, 0);

private enum MergeRule
{
First = 1,
Random = 2
First,
Random
}

private class MergedCorpus : TextCorpusBase
{
private readonly NParallelTextCorpus _corpus;
private readonly INParallelTextCorpus _corpus;

private readonly MergeRule _mergeRule;

private readonly Random _random;

private readonly int _seed;

public MergedCorpus(NParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule, int seed)
public MergedCorpus(INParallelTextCorpus nParallelTextCorpus, MergeRule mergeRule, int seed)
{
_corpus = nParallelTextCorpus;
_mergeRule = mergeRule;
_seed = seed;
_random = new Random(_seed);
_random = new Random(seed);
}

public override IEnumerable<IText> Texts => _corpus.Corpora.SelectMany(c => c.Texts);

public override bool IsTokenized =>
Enumerable.Range(0, _corpus.N).Select(i => _corpus.GetIsTokenized(i)).All(b => b);
public override bool IsTokenized => Enumerable.Range(0, _corpus.N).All(i => _corpus.IsTokenized(i));

public override ScrVers Versification => _corpus.N > 0 ? _corpus.Corpora.First().Versification : null;
public override ScrVers Versification => _corpus.N > 0 ? _corpus.Corpora[0].Versification : null;

public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
{
Expand All @@ -579,14 +582,14 @@ public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
{
IReadOnlyList<int> nonEmptyIndices = nRow
.NSegments.Select((s, i) => (s, i))
.Where(pair => pair.s.Count > 0 || nRow.GetIsInRange(pair.i))
.Where(pair => pair.s.Count > 0 || nRow.IsInRange(pair.i))
.Select(pair => pair.i)
.ToList();
IReadOnlyList<int> indices =
nonEmptyIndices.Count > 0 ? nonEmptyIndices : Enumerable.Range(0, nRow.N).ToList();
if (indexOfInRangeRow == -1)
{
indices = indices.Where(i => nRow.GetIsRangeStart(i) || !nRow.GetIsInRange(i)).ToList();
indices = indices.Where(i => nRow.IsRangeStart(i) || !nRow.IsInRange(i)).ToList();
}
if (indices.Count == 0)
continue;
Expand All @@ -601,11 +604,11 @@ public override IEnumerable<TextRow> GetRows(IEnumerable<string> textIds)
break;
}
indexOfSelectedRow = indexOfInRangeRow != -1 ? indexOfInRangeRow : indexOfSelectedRow;
if (!nRow.GetIsInRange(indexOfSelectedRow))
if (!nRow.IsInRange(indexOfSelectedRow))
{
indexOfInRangeRow = -1;
}
if (nRow.GetIsRangeStart(indexOfSelectedRow))
if (nRow.IsRangeStart(indexOfSelectedRow))
{
indexOfInRangeRow = indexOfSelectedRow;
}
Expand Down
4 changes: 4 additions & 0 deletions src/SIL.Machine/Corpora/INParallelTextCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ namespace SIL.Machine.Corpora
{
public interface INParallelTextCorpus : ICorpus<NParallelTextRow>
{
int N { get; }
IReadOnlyList<ITextCorpus> Corpora { get; }

bool IsTokenized(int i);
int Count(bool includeEmpty = true, IEnumerable<string> textIds = null);

IEnumerable<NParallelTextRow> GetRows(IEnumerable<string> textIds);
Expand Down
Loading

0 comments on commit e07cf64

Please sign in to comment.