From ef1e47ae6361555a7f07f988cba14fa7528584ca Mon Sep 17 00:00:00 2001
From: Micha Moskovic
Date: Mon, 20 Dec 2021 15:43:29 +0100
Subject: [PATCH] workflows: improve France match for HAL candidates

* refs inspirehep/inspirehep#2331
---
 inspirehep/modules/workflows/tasks/actions.py |  4 +--
 .../unit/workflows/test_workflows_actions.py  | 32 +++++++++++++++++++
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/inspirehep/modules/workflows/tasks/actions.py b/inspirehep/modules/workflows/tasks/actions.py
index 1cab347226..33fab807b4 100644
--- a/inspirehep/modules/workflows/tasks/actions.py
+++ b/inspirehep/modules/workflows/tasks/actions.py
@@ -1193,8 +1193,8 @@ def check_if_france_in_fulltext(obj, eng):
     fulltext = get_fulltext(obj)
     if not fulltext:
         return
-    fulltext_lower = fulltext.lower()
-    return 'france' in fulltext_lower or 'in2p3' in fulltext_lower
+    regex = re.compile(r"\bfrance\b|in2p3", re.UNICODE | re.IGNORECASE)
+    return regex.search(fulltext)
 
 
 def check_if_france_in_raw_affiliations(obj, eng):
diff --git a/tests/unit/workflows/test_workflows_actions.py b/tests/unit/workflows/test_workflows_actions.py
index 571f01053e..bbb3681909 100644
--- a/tests/unit/workflows/test_workflows_actions.py
+++ b/tests/unit/workflows/test_workflows_actions.py
@@ -517,6 +517,38 @@ def test_check_if_france_in_fulltext_when_france_in_header(mocked_get_document,
     assert france_in_fulltext
 
 
+@patch("inspirehep.modules.workflows.tasks.actions.get_document_in_workflow")
+def test_check_if_france_in_fulltext_doesnt_include_francesco(mocked_get_document, app):
+    fake_grobid_response = "Francesco, Papa"
+
+    obj = MagicMock()
+    obj.data = {
+        'authors': [
+            {"full_name": "author 1"},
+            {"full_name": "author 2"},
+            {"full_name": "author 3"}
+        ]
+    }
+
+    obj.extra_data = {}
+    eng = None
+
+    new_config = {"GROBID_URL": "http://grobid_url.local"}
+    with patch.dict(current_app.config, new_config):
+        with requests_mock.Mocker() as requests_mocker:
+            requests_mocker.register_uri(
+                'POST', 'http://grobid_url.local/api/processFulltextDocument',
+                text=fake_grobid_response,
+                headers={'content-type': 'application/xml'},
+                status_code=200,
+            )
+            with tempfile.NamedTemporaryFile() as tmp_file:
+                mocked_get_document.return_value.__enter__.return_value = tmp_file.name
+                france_in_fulltext = check_if_france_in_fulltext(obj, eng)
+
+    assert not france_in_fulltext
+
+
 def test_check_if_france_in_affiliations(app):
     obj = MagicMock()
     obj.data = {