Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix incorrect handling of descriptive titles in USX
Browse files Browse the repository at this point in the history
ddaspit committed Nov 5, 2024
1 parent 3a9df2c commit 6d73765
Showing 4 changed files with 84 additions and 16 deletions.
2 changes: 2 additions & 0 deletions machine/corpora/__init__.py
Original file line number Diff line number Diff line change
@@ -65,6 +65,7 @@
from .usx_file_alignment_corpus import UsxFileAlignmentCorpus
from .usx_file_text import UsxFileText
from .usx_file_text_corpus import UsxFileTextCorpus
from .usx_memory_text import UsxMemoryText
from .usx_zip_text import UsxZipText
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
@@ -150,6 +151,7 @@
"UsxFileAlignmentCorpus",
"UsxFileText",
"UsxFileTextCorpus",
"UsxMemoryText",
"UsxZipText",
"ZipParatextProjectSettingsParser",
"ZipParatextProjectSettingsParserBase",
15 changes: 15 additions & 0 deletions machine/corpora/usx_memory_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import Optional

from ..scripture.verse_ref import Versification
from .memory_stream_container import MemoryStreamContainer
from .stream_container import StreamContainer
from .usx_text_base import UsxTextBase


class UsxMemoryText(UsxTextBase):
    """USX text whose document is supplied as an in-memory string."""

    def __init__(self, id: str, usx: str, versification: Optional[Versification] = None) -> None:
        """Initialise the text.

        Args:
            id: Identifier forwarded to the base class.
            usx: The USX document content kept on the instance.
            versification: Optional versification forwarded to the base class.
        """
        super().__init__(id, versification)
        self._usx = usx

    def _create_stream_container(self) -> StreamContainer:
        """Return a ``MemoryStreamContainer`` built from the stored USX string."""
        container = MemoryStreamContainer(self._usx)
        return container
54 changes: 38 additions & 16 deletions machine/corpora/usx_verse_parser.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from __future__ import annotations

import string
from dataclasses import dataclass, field
from typing import BinaryIO, Iterable, List, Optional
from xml.etree import ElementTree

from ..scripture.verse_ref import are_overlapping_verse_ranges
from ..utils.string_utils import has_sentence_ending, is_integer
from ..utils.string_utils import has_sentence_ending
from .corpora_utils import merge_verse_ranges
from .usx_token import UsxToken
from .usx_verse import UsxVerse
@@ -86,25 +87,46 @@ def _parse_element(self, elem: ElementTree.Element, ctxt: _ParseContext) -> Iter
ctxt.add_token(e.tail)


# Paragraph style names regarded as non-verse content.
_NONVERSE_PARA_STYLES = {
    "ms",
    "mr",
    "s",
    "sr",
    "r",
    "d",
    "sp",
    "rem",
    "restore",
    "cl",
}


def _is_numbered_style(style_prefix: str, style: str) -> bool:
    """Report whether ``style`` is ``style_prefix`` plus a remainder accepted by ``is_integer``.

    Args:
        style_prefix: The leading text the style must start with.
        style: The style name to examine.

    Returns:
        ``False`` when ``style`` does not start with ``style_prefix``;
        otherwise the result of ``is_integer`` on the text after the prefix.
    """
    if not style.startswith(style_prefix):
        return False
    suffix = style[len(style_prefix) :]
    return is_integer(suffix)
# Paragraph styles whose content is treated as verse text. Only base names are
# listed; numbered variants (e.g. "q1", "li2") are matched by stripping the
# trailing digits before the lookup in _is_verse_para.
_VERSE_PARA_STYLES = {
    # Paragraphs
    "p",
    "m",
    "po",
    "pr",
    "cls",
    "pmo",
    "pm",
    "pmc",
    "pmr",
    "pi",
    "pc",
    "mi",
    "nb",
    # Poetry
    "q",
    "qc",
    "qr",
    "qm",
    "qd",
    "b",
    "d",
    # Lists
    "lh",
    "li",
    "lf",
    "lim",
    # Deprecated
    "ph",
    "phi",
    "ps",
    "psi",
}


def _is_verse_para(para_elem: ElementTree.Element) -> bool:
    """Return whether a paragraph element uses a verse-bearing style.

    The decision is an allow-list lookup: the element's ``style`` attribute
    (empty string when absent) has any trailing digits removed and must then
    be a member of ``_VERSE_PARA_STYLES``. Styles not in the allow-list,
    including a missing style, are not verse paragraphs.

    Args:
        para_elem: The paragraph element to classify.

    Returns:
        ``True`` if the digit-stripped style is in ``_VERSE_PARA_STYLES``,
        otherwise ``False``.
    """
    # Strip the numeric level so "q2" is looked up as "q", "li1" as "li", etc.
    style = para_elem.get("style", "").rstrip(string.digits)
    return style in _VERSE_PARA_STYLES


@dataclass
29 changes: 29 additions & 0 deletions tests/corpora/test_usx_memory_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from typing import List

from testutils.corpora_test_helpers import scripture_ref

from machine.corpora import ScriptureRef, TextRow, UsxMemoryText


def test_get_rows_descriptive_title() -> None:
    """Check the rows produced when verse one starts inside a ``d`` paragraph.

    The document yields two rows; the first is MAT 1:1 and its text is the
    content of the ``d`` paragraph.
    """
    usx = r"""<usx version="3.0">
<book code="MAT" style="id">- Test</book>
<chapter number="1" style="c" />
<para style="d">
<verse number="1" style="v" sid="MAT 1:1" />Descriptive title</para>
<para style="p">
The rest of verse one.<verse eid="MAT 1:1" />
<verse number="2" style="v" />This is verse two.</para>
</usx>
"""
    rows = get_rows(usx)

    assert len(rows) == 2

    # The failure messages list every row so a mismatch is easy to diagnose.
    first_row = rows[0]
    assert scripture_ref(first_row) == ScriptureRef.parse("MAT 1:1"), ",".join([str(tr.ref) for tr in rows])
    assert first_row.text == "Descriptive title", ",".join([tr.text for tr in rows])


def get_rows(usx: str) -> List[TextRow]:
    """Build a ``UsxMemoryText`` with id ``"MAT"`` from ``usx`` and return all of its rows as a list."""
    return [*UsxMemoryText("MAT", usx).get_rows()]

0 comments on commit 6d73765

Please sign in to comment.