Skip to content

Commit

Permalink
Merge pull request #331 from nationalarchives/fix/remove-only-uk-orig…
Browse files Browse the repository at this point in the history
…in-tna-ref-tags

Remove only uk:origin=tna ref tags
  • Loading branch information
jacksonj04 authored Dec 4, 2023
2 parents ef84512 + 1077468 commit 13121af
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 13 deletions.
48 changes: 36 additions & 12 deletions src/replacer/make_replacments.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,38 @@
import re
from typing import Tuple

import lxml.etree
from bs4 import BeautifulSoup

from replacer.replacer_pipeline import replacer_pipeline

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.DEBUG)

# XSLT 1.0 stylesheet used to undo earlier enrichment.
# It is an identity transform (every attribute and node is copied as-is) with
# one override: an akn:ref element whose uk:origin attribute equals 'TNA' is
# replaced by the result of processing its children, so the tag itself is
# dropped while its content is kept. xsl:strip-space applies to all elements,
# so whitespace-only text nodes in the input are discarded.
TAG_REMOVE_XSLT = """
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:akn="http://docs.oasis-open.org/legaldocml/ns/akn/3.0"
xmlns:uk="https://caselaw.nationalarchives.gov.uk/akn">
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
<xsl:strip-space elements="*"/>
<!-- identity transform -->
<xsl:template match="@*|node()">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
<xsl:template match="akn:ref[@uk:origin='TNA']">
<xsl:apply-templates/>
</xsl:template>
<!-- <xsl:template match="reg"/> -->
</xsl:stylesheet>
"""


def make_post_header_replacements(
original_content: str, replacement_patterns: str
Expand Down Expand Up @@ -98,7 +123,7 @@ def detect_reference(text, etype):


def sanitize_judgment(file_content):
file_content = _remove_legislation_references(file_content)
file_content = _remove_old_enrichment_references(file_content)

soup = BeautifulSoup(file_content, "xml")

Expand All @@ -116,18 +141,17 @@ def _decompose_elements(soup, *element_kwargs):
element.decompose()


def _remove_legislation_references(file_content):
remove_from_judgment = []
legislation_references = detect_reference(file_content, "legislation")
for reference in legislation_references:
canonical_reference = reference[1]
opening = canonical_reference.split(">")[0] + ">"
remove_from_judgment.append((opening, ""))
remove_from_judgment.append(("</ref>", ""))
def _remove_old_enrichment_references(file_content):
"""
Enrichment creates <ref uk:origin="TNA"> tags; delete only these.
"""
root = lxml.etree.fromstring(file_content.encode("utf-8"))

transform = lxml.etree.XSLT(lxml.etree.XML(TAG_REMOVE_XSLT))

result = transform(root)

for k, v in remove_from_judgment:
file_content = file_content.replace(k, v)
return file_content
return lxml.etree.tostring(result).decode("utf-8")


def split_text_by_closing_header_tag(content: str) -> Tuple[str, str, str]:
Expand Down
54 changes: 53 additions & 1 deletion src/tests/replacer_tests/test_make_replacements.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,31 @@
from pathlib import Path

import lxml.etree
import pytest

from replacer.make_replacments import (
_remove_old_enrichment_references,
make_post_header_replacements,
split_text_by_closing_header_tag,
)

FIXTURE_DIR = Path(__file__).parent.parent.resolve() / "fixtures/"


def canonical_xml(text):
    """Return a whitespace-insensitive canonical byte form of an XML string.

    The text is parsed, serialised with the "c14n2" method, and then every
    newline and space byte is removed from the result. The output is meant
    for equality comparison only, not for further parsing.

    With thanks to https://stackoverflow.com/questions/52422385/python-3-xml-canonicalization
    """
    parsed = lxml.etree.fromstring(text.encode("utf-8"))
    canonical = lxml.etree.tostring(parsed, method="c14n2")
    for whitespace in (b"\n", b" "):
        canonical = canonical.replace(whitespace, b"")
    return canonical


def assert_xml_same(a, b):
    """Assert that two XML strings are equal once canonicalised.

    Both inputs have surrounding whitespace stripped before being passed to
    canonical_xml.
    """
    left = canonical_xml(a.strip())
    right = canonical_xml(b.strip())
    assert left == right


class TestMakePostHeaderReplacements:
def test_make_post_header_replacements(self):
original_file_content = open(
Expand All @@ -27,7 +43,43 @@ def test_make_post_header_replacements(self):
content_with_replacements = make_post_header_replacements(
original_file_content, replacement_content
)
assert content_with_replacements == expected_file_content.strip()
assert canonical_xml(content_with_replacements) == canonical_xml(
expected_file_content.strip()
)

def test_post_header_works_if_already_enriched(self):
    """Check that applying replacements to already-enriched XML is a no-op.

    The stage-1 enriched fixture serves as both the input and the expected
    output; the comparison is made on canonicalised XML via assert_xml_same.
    """
    enriched_fixture = FIXTURE_DIR / "ewhc-ch-2023-257_enriched_stage_1.xml"
    replacements_fixture = FIXTURE_DIR / "ewhc-ch-2023-257_replacements.txt"

    # Path.read_text opens and closes the file itself; the previous bare
    # open(...).read() calls never closed their handles.
    original_file_content = enriched_fixture.read_text(encoding="utf-8")
    replacement_content = replacements_fixture.read_text(encoding="utf-8")
    # Input and expectation are the same fixture, so one read is enough.
    expected_file_content = original_file_content

    content_with_replacements = make_post_header_replacements(
        original_file_content, replacement_content
    )

    assert_xml_same(content_with_replacements, expected_file_content)

def test_remove_legislation_references(self):
    """Check which <ref> tags _remove_old_enrichment_references removes.

    Nested refs with uk:origin="TNA" must be unwrapped with their content
    kept, while a ref with a different uk:origin value must survive.
    """
    nested_tna_refs = """
<xml xmlns='http://docs.oasis-open.org/legaldocml/ns/akn/3.0' xmlns:uk="https://caselaw.nationalarchives.gov.uk/akn">
<a><e><ref uk:origin="TNA"><ref uk:origin="TNA"><b>AAA</b></ref><c/></ref>D</e></a>
</xml>"""
    tidy_output = _remove_old_enrichment_references(nested_tna_refs)
    assert "<a><e><b>AAA</b><c/>D</e></a>" in tidy_output

    other_origin_ref = '<akomaNtoso xmlns:uk="https://caselaw.nationalarchives.gov.uk/akn"><ref uk:origin="not-TNA"></ref></akomaNtoso>'
    untouched_output = _remove_old_enrichment_references(other_origin_ref)
    assert "not-TNA" in untouched_output


class TestSplitTextByClosingHeaderTag:
Expand Down

0 comments on commit 13121af

Please sign in to comment.