Skip to content

Commit

Permalink
the ideas down - more refining needed
Browse files Browse the repository at this point in the history
  • Loading branch information
johnml1135 committed Jan 16, 2025
1 parent c304a75 commit d1cc368
Show file tree
Hide file tree
Showing 12 changed files with 254 additions and 63 deletions.
10 changes: 10 additions & 0 deletions src/SIL.Machine/Corpora/IUsfmParserHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,16 @@ IReadOnlyList<UsfmAttribute> attributes
/// </summary>
void EndNote(UsfmParserState state, string marker, bool closed);

/// <summary>
/// Start of a note text
/// </summary>
/// <param name="state">The parser state at the point the note text starts.</param>
/// <param name="marker">The marker of the token that starts the note text.</param>
void StartNoteText(UsfmParserState state, string marker);

/// <summary>
/// End of a note text
/// </summary>
/// <param name="state">The parser state at the point the note text ends.</param>
/// <param name="marker">The marker associated with the note text.</param>
// NOTE(review): no parser call site for EndNoteText is visible alongside this declaration; confirm which
// marker value callers are expected to pass.
void EndNoteText(UsfmParserState state, string marker);

/// <summary>
/// Start of a table
/// </summary>
Expand Down
12 changes: 10 additions & 2 deletions src/SIL.Machine/Corpora/ParatextProjectTextUpdaterBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ public string UpdateUsfm(
string bookId,
IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> rows,
string fullName = null,
UpdateUsfmBehavior behavior = UpdateUsfmBehavior.PreferExisting
UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting,
UpdateUsfmIntraVerseMarkerBehavior noteBehavior = UpdateUsfmIntraVerseMarkerBehavior.Preserve,
UpdateUsfmIntraVerseMarkerBehavior formattingBehavior = UpdateUsfmIntraVerseMarkerBehavior.Strip
)
{
string fileName = _settings.GetBookFileName(bookId);
Expand All @@ -36,7 +38,13 @@ public string UpdateUsfm(
usfm = reader.ReadToEnd();
}

var handler = new UpdateUsfmParserHandler(rows, fullName is null ? null : $"- {fullName}", behavior);
var handler = new UpdateUsfmParserHandler(
rows,
fullName is null ? null : $"- {fullName}",
textBehavior,
noteBehavior,
formattingBehavior
);
try
{
UsfmParser.Parse(usfm, handler, _settings.Stylesheet, _settings.Versification);
Expand Down
19 changes: 16 additions & 3 deletions src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase
private readonly Stack<ScriptureElement> _curElements;
private readonly Stack<ScriptureTextType> _curTextType;
private bool _duplicateVerse = false;
private bool _inNoteText = false;

protected ScriptureRefUsfmParserHandlerBase()
{
Expand Down Expand Up @@ -158,13 +159,12 @@ public override void StartNote(UsfmParserState state, string marker, string call
// if we hit a note in a verse paragraph and we aren't in a verse, then start a non-verse segment
CheckConvertVerseParaToNonVerse(state);
NextElement(marker);
StartNoteText(state);
}
}

public override void EndNote(UsfmParserState state, string marker, bool closed)
{
if (CurrentTextType == ScriptureTextType.Note && !_duplicateVerse)
if (_inNoteText && !_duplicateVerse)
EndNoteText(state);
}

Expand Down Expand Up @@ -192,6 +192,17 @@ IReadOnlyList<UsfmAttribute> attributes
CheckConvertVerseParaToNonVerse(state);
}

/// <summary>
/// Closes the open note text when the character style that just ended is a note or cross-reference part.
/// </summary>
/// <param name="state">The current parser state.</param>
/// <param name="marker">The marker of the character style that ended.</param>
/// <param name="attributes">The attributes of the character style (not used here).</param>
/// <param name="closed">Whether the character style was explicitly closed (not used here).</param>
public override void EndChar(
    UsfmParserState state,
    string marker,
    IReadOnlyList<UsfmAttribute> attributes,
    bool closed
)
{
    // Nothing to close unless note text is currently open and the duplicate-verse flag is not set.
    if (!_inNoteText || _duplicateVerse)
        return;

    if (UsfmStylesheet.IsNoteOrCrossReferencePart(marker))
        EndNoteText(state);
}

protected virtual void StartVerseText(UsfmParserState state, IReadOnlyList<ScriptureRef> scriptureRefs) { }

protected virtual void EndVerseText(UsfmParserState state, IReadOnlyList<ScriptureRef> scriptureRefs) { }
Expand Down Expand Up @@ -231,16 +242,18 @@ private void EndNonVerseText(UsfmParserState state)
_curTextType.Pop();
}

private void StartNoteText(UsfmParserState state)
public void StartNoteText(UsfmParserState state)
{
_curTextType.Push(ScriptureTextType.Note);
StartNoteText(state, CreateNonVerseRef());
_inNoteText = true;
}

// Ends the current note text: invokes the two-argument EndNoteText hook with a freshly created non-verse
// reference, then pops the current text type and clears the in-note-text flag.
private void EndNoteText(UsfmParserState state)
{
// NOTE(review): the reference is created before the text type is popped; presumably CreateNonVerseRef
// depends on the current element/text-type state, so keep this ordering - confirm.
EndNoteText(state, CreateNonVerseRef());
_curTextType.Pop();
_inNoteText = false;
}

private void UpdateVerseRef(VerseRef verseRef, string marker)
Expand Down
94 changes: 59 additions & 35 deletions src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs
Original file line number Diff line number Diff line change
@@ -1,16 +1,23 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;

namespace SIL.Machine.Corpora
{
public enum UpdateUsfmBehavior
public enum UpdateUsfmTextBehavior
{
PreferExisting,
PreferNew,
StripExisting
}

/// <summary>
/// Selects what happens to existing markers that sit inside text being replaced. The update handler takes one
/// value for note markers and another for formatting (character style) markers.
/// </summary>
public enum UpdateUsfmIntraVerseMarkerBehavior
{
// The handler does not force the existing marker tokens to be skipped for this value; they are still
// skipped when the text behavior is StripExisting.
Preserve,
// The handler skips the existing marker tokens while inside text that is being replaced.
Strip,
}

/***
* This is a USFM parser handler that can be used to replace the existing text in a USFM file with the specified
* text.
Expand All @@ -21,23 +28,30 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
private readonly List<UsfmToken> _tokens;
private readonly List<UsfmToken> _newTokens;
private readonly string _idText;
private readonly UpdateUsfmBehavior _behavior;
private readonly UpdateUsfmTextBehavior _textBehavior;
private readonly UpdateUsfmIntraVerseMarkerBehavior _noteBehavior;
private readonly UpdateUsfmIntraVerseMarkerBehavior _formattingBehavior;
private readonly Stack<bool> _replace;
private readonly Regex _nonAlpha = new Regex("[^a-zA-Z0-9]");
private int _rowIndex;
private int _tokenIndex;

public UpdateUsfmParserHandler(
IReadOnlyList<(IReadOnlyList<ScriptureRef>, string)> rows = null,
string idText = null,
UpdateUsfmBehavior behavior = UpdateUsfmBehavior.PreferExisting
UpdateUsfmTextBehavior textBehavior = UpdateUsfmTextBehavior.PreferExisting,
UpdateUsfmIntraVerseMarkerBehavior noteBehavior = UpdateUsfmIntraVerseMarkerBehavior.Preserve,
UpdateUsfmIntraVerseMarkerBehavior formattingBehavior = UpdateUsfmIntraVerseMarkerBehavior.Strip
)
{
_rows = rows ?? Array.Empty<(IReadOnlyList<ScriptureRef>, string)>();
_tokens = new List<UsfmToken>();
_newTokens = new List<UsfmToken>();
_idText = idText;
_replace = new Stack<bool>();
_behavior = behavior;
_textBehavior = textBehavior;
_noteBehavior = noteBehavior;
_formattingBehavior = formattingBehavior;
}

public IReadOnlyList<UsfmToken> Tokens => _tokens;
Expand Down Expand Up @@ -176,8 +190,10 @@ bool closed
)
{
// strip out char-style markers in verses that are being replaced
if (closed && ReplaceWithNewTokens(state))
if (ReplaceWithNewTokens(state, closed: closed, endCharacter: true))
SkipTokens(state);
else
CollectTokens(state);

base.EndChar(state, marker, attributes, closed);
}
Expand All @@ -196,8 +212,10 @@ public override void StartNote(UsfmParserState state, string marker, string call
public override void EndNote(UsfmParserState state, string marker, bool closed)
{
// strip out notes in verses that are being replaced
if (closed && ReplaceWithNewTokens(state))
if (ReplaceWithNewTokens(state, closed: closed, endNote: true))
SkipTokens(state);
else
CollectTokens(state);

base.EndNote(state, marker, closed);
}
Expand Down Expand Up @@ -271,25 +289,7 @@ protected override void EndNonVerseText(UsfmParserState state, ScriptureRef scri
protected override void StartNoteText(UsfmParserState state, ScriptureRef scriptureRef)
{
IReadOnlyList<string> rowTexts = AdvanceRows(new[] { scriptureRef });
var newTokens = new List<UsfmToken>();
if (rowTexts.Count > 0)
{
newTokens.Add(state.Token);
newTokens.Add(new UsfmToken(UsfmTokenType.Character, "ft", null, "ft*"));
for (int i = 0; i < rowTexts.Count; i++)
{
string text = rowTexts[i];
if (i < rowTexts.Count - 1)
text += " ";
newTokens.Add(new UsfmToken(text));
}
newTokens.Add(new UsfmToken(UsfmTokenType.End, state.Token.EndMarker, null, null));
PushNewTokens(newTokens);
}
else
{
PushTokensAsPrevious();
}
PushNewTokens(rowTexts.Select(t => new UsfmToken(t + " ")));
}

protected override void EndNoteText(UsfmParserState state, ScriptureRef scriptureRef)
Expand Down Expand Up @@ -362,9 +362,16 @@ private void SkipTokens(UsfmParserState state)
_tokenIndex = state.Index + 1 + state.SpecialTokenCount;
}

private bool ReplaceWithNewTokens(UsfmParserState state)
private bool ReplaceWithNewTokens(
UsfmParserState state,
bool closed = true,
bool endCharacter = false,
bool endNote = false
)
{
bool stripExistingText = _textBehavior == UpdateUsfmTextBehavior.StripExisting;
bool newText = _replace.Count > 0 && _replace.Peek();

int tokenEnd = state.Index + state.SpecialTokenCount;
bool existingText = false;
for (int index = _tokenIndex; index <= tokenEnd; index++)
Expand All @@ -376,15 +383,37 @@ private bool ReplaceWithNewTokens(UsfmParserState state)
}
}
bool useNewTokens =
_behavior == UpdateUsfmBehavior.StripExisting
stripExistingText
|| (newText && !existingText)
|| (newText && _behavior == UpdateUsfmBehavior.PreferNew);
|| (newText && _textBehavior == UpdateUsfmTextBehavior.PreferNew && !state.IsReferenceText);

if (useNewTokens)
if (useNewTokens && _newTokens.Count > 0)
_tokens.AddRange(_newTokens);

_newTokens.Clear();
return useNewTokens;

bool skipTokens = useNewTokens && closed;

bool withinNewText = _replace.Any(r => r);

if (withinNewText)
{
string bareMarker = _nonAlpha.Replace(state.Token.Marker ?? "", "");
if (state.Token.Type == UsfmTokenType.Character || endCharacter)
{
var behavior = UsfmStylesheet.IsNoteOrCrossReferencePart(bareMarker)
? _noteBehavior
: _formattingBehavior;
skipTokens = stripExistingText || behavior == UpdateUsfmIntraVerseMarkerBehavior.Strip;
}

if (state.NoteTag != null || endNote)
{
skipTokens = stripExistingText || _noteBehavior == UpdateUsfmIntraVerseMarkerBehavior.Strip;
}
}

return skipTokens;
}

private void PushNewTokens(IEnumerable<UsfmToken> tokens)
Expand All @@ -393,11 +422,6 @@ private void PushNewTokens(IEnumerable<UsfmToken> tokens)
_newTokens.AddRange(tokens);
}

private void PushTokensAsPrevious()
{
_replace.Push(_replace.Peek());
}

private void PopNewTokens()
{
_replace.Pop();
Expand Down
10 changes: 10 additions & 0 deletions src/SIL.Machine/Corpora/UsfmParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,11 @@ public bool ProcessToken()
Handler?.Ref(State, token.Marker, display, target);
break;
}
if (IsNoteTextStart(token) && State.NoteTag != null)
{
Handler?.StartNoteText(State, token.Marker);
break;
}

string actualMarker;
bool invalidMarker = false;
Expand Down Expand Up @@ -672,5 +677,10 @@ private bool IsRef(UsfmToken token)
&& (State.Tokens[State.Index + 2].Marker == token.EndMarker)
&& (token.Marker == "ref");
}

/// <summary>
/// Returns true when the token's marker is "ft".
/// </summary>
private bool IsNoteTextStart(UsfmToken token) => token.Marker == "ft";
}
}
4 changes: 4 additions & 0 deletions src/SIL.Machine/Corpora/UsfmParserHandlerBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ public virtual void StartNote(UsfmParserState state, string marker, string calle

public virtual void EndNote(UsfmParserState state, string marker, bool closed) { }

// Does nothing by default; derived handlers override this to react to the start of a note text.
public virtual void StartNoteText(UsfmParserState state, string marker) { }

// Does nothing by default; derived handlers override this to react to the end of a note text.
public virtual void EndNoteText(UsfmParserState state, string marker) { }

public virtual void StartTable(UsfmParserState state) { }

public virtual void EndTable(UsfmParserState state) { }
Expand Down
7 changes: 3 additions & 4 deletions src/SIL.Machine/Corpora/UsfmParserState.cs
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,9 @@ public UsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOn
/// </summary>
public int SpecialTokenCount { get; internal set; }

/// <summary>
/// True if the token processed is a figure.
/// </summary>
public bool IsFigure => CharTag?.Marker == "fig";
public bool IsReferenceText =>
!(CharTag is null)
&& (UsfmStylesheet.IsReference(CharTag.Marker) || UsfmStylesheet.IsFigure(CharTag.Marker));

/// <summary>
/// Current paragraph tag or null for none.
Expand Down
Loading

0 comments on commit d1cc368

Please sign in to comment.