Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Raise error when id tag doesn't match filename book id #141

Merged
merged 3 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions machine/corpora/usfm_text_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from io import TextIOWrapper
from typing import Generator, Iterable, List, Optional, Sequence

from ..scripture.canon import ALL_BOOK_IDS
from ..scripture.verse_ref import Versification
from ..utils.string_utils import has_sentence_ending
from .corpora_utils import gen
Expand Down Expand Up @@ -90,6 +91,13 @@ def __init__(self, text: UsfmTextBase) -> None:
def rows(self) -> Iterable[TextRow]:
return self._rows

def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
    """Validate the book code announced at the start of a book.

    The base handler runs first, so the parser state already reflects the
    new book by the time validation happens.

    Raises:
        ValueError: if ``code`` is not one of ``ALL_BOOK_IDS``, or if it
            differs from the id of the text being parsed.
    """
    super().start_book(state, marker, code)

    is_known_book = code in ALL_BOOK_IDS
    if not is_known_book:
        raise ValueError(f"The book {code} is not a valid book id.")

    # The code is a real book id; it must also be the book this text claims to be.
    expected_id = self._text.id
    if code == expected_id:
        return
    raise ValueError(f"The \\id marker {code} does not match the text id {expected_id}.")

def verse(
self,
state: UsfmParserState,
Expand Down
14 changes: 13 additions & 1 deletion tests/corpora/test_scripture_text_corpus.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH
from pytest import raises
from testutils.corpora_test_helpers import USFM_MISMATCH_ID_PROJECT_PATH, USFM_TEST_PROJECT_PATH

from machine.corpora import ParatextTextCorpus, extract_scripture_corpus
from machine.scripture import ORIGINAL_VERSIFICATION, VerseRef
Expand Down Expand Up @@ -59,3 +60,14 @@ def test_extract_scripture_corpus() -> None:
assert text == ""
assert orig_vref.exact_equals(VerseRef.from_string("MAT 2:12", ORIGINAL_VERSIFICATION))
assert corpus_vref is not None and corpus_vref.exact_equals(VerseRef.from_string("MAT 2:12", corpus.versification))


def test_extract_scripture_corpus_mismatch_id() -> None:
    """Extraction fails when a file's id marker names a different book.

    The ``mismatch_id`` project holds ``07JDG.SFM`` whose id marker is ``JUD``;
    iterating the extracted corpus must surface that as a ``RuntimeError``.
    """
    mismatch_corpus = ParatextTextCorpus(USFM_MISMATCH_ID_PROJECT_PATH, include_all_text=True)

    # ``match`` is applied as a regular-expression search against the message.
    expected_error = (
        r"An error occurred while parsing the text 'JDG' in project mismatch_id. "
        r"Verse: JUD 1:0, line: 1, character: 1, error: 'The \\id marker JUD does not match the text id JDG.'"
    )
    with raises(RuntimeError, match=expected_error):
        # Extraction is lazy; materialise it so the parse actually runs.
        list(extract_scripture_corpus(mismatch_corpus))
16 changes: 15 additions & 1 deletion tests/corpora/test_usfm_file_text.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from testutils.corpora_test_helpers import USFM_TEST_PROJECT_PATH, scripture_ref
from pytest import raises
from testutils.corpora_test_helpers import USFM_INVALID_ID_PROJECT_PATH, USFM_TEST_PROJECT_PATH, scripture_ref

from machine.corpora import ScriptureRef, UsfmFileTextCorpus

Expand Down Expand Up @@ -244,6 +245,19 @@ def test_get_rows_include_markers_all_text() -> None:
assert rows[26].text == "Here is some sidebar // content."


def test_get_rows_invalid_id() -> None:
    """Reading rows fails when the file's id marker is not a valid book id.

    The ``invalid_id`` project declares the book as ``JGS``; iterating that
    text must raise a ``RuntimeError`` reporting the invalid id.
    """
    invalid_text = UsfmFileTextCorpus(USFM_INVALID_ID_PROJECT_PATH).get_text("JGS")
    assert invalid_text is not None

    # ``match`` is applied as a regular-expression search against the message.
    expected_error = (
        "An error occurred while parsing the text 'JGS'."
        " Verse: 1:0, line: 1, character: 1, error: 'The book JGS is not a valid book id."
    )
    with raises(RuntimeError, match=expected_error):
        # Iterating the text is what triggers parsing.
        list(invalid_text)


def test_usfm_file_text_corpus_lowercase_usfm_id() -> None:
corpus = UsfmFileTextCorpus(USFM_TEST_PROJECT_PATH)

Expand Down
2 changes: 2 additions & 0 deletions tests/testutils/corpora_test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
USFM_TEST_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "Tes"
USFM_TARGET_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "target"
USFM_SOURCE_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "source"
USFM_INVALID_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "invalid_id"
USFM_MISMATCH_ID_PROJECT_PATH = TEST_DATA_PATH / "usfm" / "mismatch_id"
USX_TEST_PROJECT_PATH = TEST_DATA_PATH / "usx" / "Tes"
TEXT_TEST_PROJECT_PATH = TEST_DATA_PATH / "txt"
CUSTOM_VERS_PATH = TEST_DATA_PATH / "custom.vrs"
Expand Down
5 changes: 5 additions & 0 deletions tests/testutils/data/usfm/invalid_id/07JDG.SFM
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
\id JGS - Test
\h Judges
\mt Judges
\c 1
\v 1 Chapter one, verse one.
34 changes: 34 additions & 0 deletions tests/testutils/data/usfm/invalid_id/Settings.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<ScriptureText>
<StyleSheet>usfm.sty</StyleSheet>
<Versification>4</Versification>
<LanguageIsoCode>en:::</LanguageIsoCode>
<Language>English</Language>
<MinParatextVersion>8.0.100.76</MinParatextVersion>
<FullName>Test</FullName>
<Encoding>65001</Encoding>
<Editable>T</Editable>
<Copyright />
<NormalizationForm>NFC</NormalizationForm>
<Name>invalid_id</Name>
<Guid>a7e0b3ce0200736062f9f810a444dbfbe64aca35</Guid>
<DefaultFont>Charis SIL</DefaultFont>
<DefaultFontSize>12</DefaultFontSize>
<FontFeatures />
<HtmlLanguage />
<AssociatedLexicalProject />
<FileNameBookNameForm>41MAT</FileNameBookNameForm>
<FileNamePrePart />
<FileNamePostPart>.SFM</FileNamePostPart>
<BiblicalTermsListSetting>Major::BiblicalTerms.xml</BiblicalTermsListSetting>
<MatchBasedOnStems>F</MatchBasedOnStems>
<AllowReadAccess>F</AllowReadAccess>
<AllowSharingWithSLDR>F</AllowSharingWithSLDR>
<Visibility>Public</Visibility>
<TranslationInfo>Standard::</TranslationInfo>
<EncodingConverter />
<UsfmVersion>3</UsfmVersion>
<ParallelPassagesBooks>000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000</ParallelPassagesBooks>
<BooksPresent>000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000</BooksPresent>
<BibleModuleAssociations />
<Naming PrePart="" PostPart=".SFM" BookNameForm="41MAT" />
</ScriptureText>
31 changes: 31 additions & 0 deletions tests/testutils/data/usfm/invalid_id/custom.vrs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# custom.vrs

LEV 14:56
ROM 14:26
REV 12:17
TOB 5:22
TOB 10:12
SIR 23:28
ESG 1:22
ESG 3:15
ESG 5:14
ESG 8:17
ESG 10:14
SIR 33:33
SIR 41:24
BAR 1:22
4MA 7:25
4MA 12:20

# deliberately missing verses
-ROM 16:26
-ROM 16:27
-3JN 1:15
-S3Y 1:49
-ESG 4:6
-ESG 9:5
-ESG 9:30

LEV 14:55 = LEV 14:55
LEV 14:55 = LEV 14:56
LEV 14:56 = LEV 14:57
5 changes: 5 additions & 0 deletions tests/testutils/data/usfm/mismatch_id/07JDG.SFM
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
\id JUD - Test
\h Judges
\mt Judges
\c 1
\v 1 Chapter one, verse one.
34 changes: 34 additions & 0 deletions tests/testutils/data/usfm/mismatch_id/Settings.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<ScriptureText>
<StyleSheet>usfm.sty</StyleSheet>
<Versification>4</Versification>
<LanguageIsoCode>en:::</LanguageIsoCode>
<Language>English</Language>
<MinParatextVersion>8.0.100.76</MinParatextVersion>
<FullName>Test</FullName>
<Encoding>65001</Encoding>
<Editable>T</Editable>
<Copyright />
<NormalizationForm>NFC</NormalizationForm>
<Name>mismatch_id</Name>
<Guid>a7e0b3ce0200736062f9f810a444dbfbe64aca35</Guid>
<DefaultFont>Charis SIL</DefaultFont>
<DefaultFontSize>12</DefaultFontSize>
<FontFeatures />
<HtmlLanguage />
<AssociatedLexicalProject />
<FileNameBookNameForm>41MAT</FileNameBookNameForm>
<FileNamePrePart />
<FileNamePostPart>.SFM</FileNamePostPart>
<BiblicalTermsListSetting>Major::BiblicalTerms.xml</BiblicalTermsListSetting>
<MatchBasedOnStems>F</MatchBasedOnStems>
<AllowReadAccess>F</AllowReadAccess>
<AllowSharingWithSLDR>F</AllowSharingWithSLDR>
<Visibility>Public</Visibility>
<TranslationInfo>Standard::</TranslationInfo>
<EncodingConverter />
<UsfmVersion>3</UsfmVersion>
<ParallelPassagesBooks>000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000</ParallelPassagesBooks>
<BooksPresent>000000000000000000000000000000000000001100000000000000000000000000000000000000000000000000000000000000000000000000000000000</BooksPresent>
<BibleModuleAssociations />
<Naming PrePart="" PostPart=".SFM" BookNameForm="41MAT" />
</ScriptureText>
31 changes: 31 additions & 0 deletions tests/testutils/data/usfm/mismatch_id/custom.vrs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# custom.vrs

LEV 14:56
ROM 14:26
REV 12:17
TOB 5:22
TOB 10:12
SIR 23:28
ESG 1:22
ESG 3:15
ESG 5:14
ESG 8:17
ESG 10:14
SIR 33:33
SIR 41:24
BAR 1:22
4MA 7:25
4MA 12:20

# deliberately missing verses
-ROM 16:26
-ROM 16:27
-3JN 1:15
-S3Y 1:49
-ESG 4:6
-ESG 9:5
-ESG 9:30

LEV 14:55 = LEV 14:55
LEV 14:55 = LEV 14:56
LEV 14:56 = LEV 14:57
Loading