Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
nooraangelva committed Aug 4, 2022
1 parent c4e9d00 commit 7e364e2
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 3 deletions.
12 changes: 9 additions & 3 deletions inspirehep/modules/workflows/tasks/arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ def extract_authors_from_xml(xml_content):
undefined_or_empty_inspireid_value_regex = re.compile("undefined|inspire-\s*$", re.IGNORECASE)
undefined_value_regex = re.compile("undefined", re.IGNORECASE)
ror_path_value_regex = re.compile("https://ror.org/*")
remove_new_line_regex = re.compile(u"\s*\n\s*")

# Goes through all the authors in the file
for author in content.xpath("//Person"):
Expand All @@ -294,6 +295,8 @@ def extract_authors_from_xml(xml_content):

# Gets all the author ids
for source, id in itertools.izip(author.xpath('./authorIDs/authorID[@source!="" and text()!=""]/@source | ./authorids/authorid[@source!="" and text()!=""]/@source').getall(), author.xpath('./authorIDs/authorID[@source!="" and text()!=""]/text() | ./authorids/authorid[@source!="" and text()!=""]/text()').getall()):
source = re.sub(remove_new_line_regex, '', source)
id = re.sub(remove_new_line_regex, '', id)
if not re.match(undefined_value_regex, source) and not re.match(undefined_or_empty_inspireid_value_regex, id):
if source == u'CCID':
ids.append(['CERN', id])
Expand All @@ -304,12 +307,15 @@ def extract_authors_from_xml(xml_content):

# Gets all the names for affiliated organizations using the organization ids from author
for affiliation in author.xpath("./authorAffiliations/authorAffiliation/@organizationid").getall():
orgName = content.xpath(u'//organizations/Organization[@id="{}"]/orgName[@source="spiresICN" or @source="INSPIRE" and text()!="" ]/text()'.format(affiliation)).get()
if orgName and not re.match(undefined_or_none_value_regex, orgName):
affiliations.append(orgName)
orgName = str(content.xpath(u'//organizations/Organization[@id="{}"]/orgName[@source="spiresICN" or @source="INSPIRE" and text()!="" ]/text()'.format(affiliation)).get())
cleaned_org_name = re.sub(remove_new_line_regex, '', orgName)
if orgName and not re.match(undefined_or_none_value_regex, cleaned_org_name):
affiliations.append(cleaned_org_name)

# Gets all the affiliations_identifiers for affiliated organizations using the organization ids from author
for value, source in itertools.izip(content.xpath(u'//organizations/Organization[@id="{}"]/orgName[@source="ROR" or @source="GRID" and text()!=""]/text()'.format(affiliation)).getall(), content.xpath(u'//organizations/Organization[@id="{}"]/orgName[@source="ROR" or @source="GRID" and text()!=""]/@source'.format(affiliation)).getall()):
source = re.sub(remove_new_line_regex, '', source)
value = re.sub(remove_new_line_regex, '', value)
if re.match(undefined_or_none_value_regex, source) or re.match(undefined_or_none_value_regex, value):
continue

Expand Down
Binary file added tests/unit/workflows/fixtures/2207.10906.tar.gz
Binary file not shown.
50 changes: 50 additions & 0 deletions tests/unit/workflows/test_workflows_tasks_arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1336,3 +1336,53 @@ def test_arxiv_author_no_none_in_ror():
validate(expected_author, authors_subschema)

assert expected_author[0] == obj.data['authors'][16]


def test_arxiv_handles_newLines():
    """Check the first author produced by ``arxiv_author_list`` for 2207.10906.

    The task is run on a mock workflow object whose only file is the
    ``2207.10906.tar.gz`` fixture, and the first entry of
    ``obj.data['authors']`` is compared with a hand-written record.

    NOTE(review): going by the test name, the fixture presumably contains
    author values broken across lines -- confirm against the fixture.
    """
    schema = load_schema('hep')
    properties = schema['properties']
    tarball_path = pkg_resources.resource_filename(
        __name__, os.path.join('fixtures', '2207.10906.tar.gz'))

    # Minimal record: a single arXiv eprint, validated against its subschema.
    eprint = {
        'categories': [
            'hep-ex',
        ],
        'value': '2207.10906',
    }
    record = {
        '$schema': 'http://localhost:5000/hep.json',
        'arxiv_eprints': [eprint],
    }
    validate(record['arxiv_eprints'], properties['arxiv_eprints'])

    # The mock file entry exposes the fixture path through ``file.uri``.
    tarball = AttrDict({
        'file': AttrDict({
            'uri': tarball_path,
        }),
    })
    attached_files = MockFiles({'2207.10906.tar.gz': tarball})

    obj = MockObj(record, {}, files=attached_files)
    eng = MockEng()

    first_author = {
        'affiliations': [
            {'value': u'Beijing, Inst. High Energy Phys.'},
        ],
        'ids': [
            {'value': u'INSPIRE-00059665', 'schema': u'INSPIRE ID'},
            {'value': u'0000-0002-3935-619X', 'schema': u'ORCID'},
        ],
        'full_name': u'Ablikim, Medina',
    }
    # Make sure the expectation itself is schema-valid before comparing.
    validate([first_author], properties['authors'])

    arxiv_author_list(obj, eng)
    assert first_author == obj.data['authors'][0]

0 comments on commit 7e364e2

Please sign in to comment.