From c10e5b99a0d15cd19b4680629b7a111c2ef2f28d Mon Sep 17 00:00:00 2001 From: Dragon Dave McKee Date: Wed, 14 Feb 2024 12:03:49 +0000 Subject: [PATCH] Remove code that extracts patterns outside of function --- src/lambdas/update_rules_processor/index.py | 9 ++------- src/utils/validate_patterns.py | 5 ++++- validate_match_csv.py | 3 +-- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/lambdas/update_rules_processor/index.py b/src/lambdas/update_rules_processor/index.py index 2e32f23fa..e4d54f22b 100644 --- a/src/lambdas/update_rules_processor/index.py +++ b/src/lambdas/update_rules_processor/index.py @@ -1,6 +1,5 @@ #!env/bin/python -import json import logging import urllib.parse from io import StringIO @@ -70,15 +69,11 @@ def lambda_handler(event: S3Event, context: LambdaContext) -> None: csv_file = response["Body"].read().decode("utf-8") df = pd.read_csv(StringIO(csv_file)) + # used by determine_replacements_caselaw create_test_jsonl(source_bucket, df) - jsonl_key = "test_citation_patterns.jsonl" - - patterns_resp = s3.get_object(Bucket=source_bucket, Key=jsonl_key) - patterns = patterns_resp["Body"] - pattern_list = [json.loads(line) for line in patterns.iter_lines()] try: - test_manifest(df, pattern_list) + test_manifest(df) except AssertionError: LOGGER.error("Exception: Manifest test failed") raise diff --git a/src/utils/validate_patterns.py b/src/utils/validate_patterns.py index 5c6882717..030f5f5b2 100644 --- a/src/utils/validate_patterns.py +++ b/src/utils/validate_patterns.py @@ -1,8 +1,10 @@ +import json + import pandas as pd import spacy -def test_manifest(df: pd.DataFrame, patterns: list[str]) -> None: +def test_manifest(df: pd.DataFrame) -> None: """ Test for the rules manifest: given a dataframe of the CSV file, and the patterns (which are also derived directly from that CSV file), check that the number of @@ -13,6 +15,7 @@ def test_manifest(df: pd.DataFrame, patterns: list[str]) -> None: "en_core_web_sm", exclude=["tok2vec", "attribute_ruler", "lemmatizer", "ner"] ) nlp.max_length = 2500000 + patterns = [json.loads(s) for s in df["pattern"]] citation_ruler = nlp.add_pipe("entity_ruler") citation_ruler.add_patterns(patterns) diff --git a/validate_match_csv.py b/validate_match_csv.py index be5178d7c..23cc6a726 100644 --- a/validate_match_csv.py +++ b/validate_match_csv.py @@ -55,5 +55,4 @@ def get_patterns(csv_dict): if len(match) > 1: raise RuntimeError(f"{len(match)} matches for {item['match_example']!r}") - -test_manifest(df, patterns) +test_manifest(df)