Skip to content

Commit

Permalink
ARXIVCE-2997 versionless canonical URL for PDF and HTML articles
Browse files Browse the repository at this point in the history
  • Loading branch information
dginev committed Jan 7, 2025
1 parent 43f79bd commit 0ddb436
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 8 deletions.
13 changes: 6 additions & 7 deletions browse/controllers/files/dissemination.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,7 @@ def pdf_resp_fn(file: FileObj,
resp = default_resp_fn(file, arxiv_id, docmeta, version)
filename = f"{arxiv_id.filename}v{version.version}.pdf"
resp.headers["Content-Disposition"] = f"inline; filename=\"{filename}\""
if arxiv_id.has_version:
resp.headers["Link"] = f"<https://arxiv.org/pdf/{arxiv_id.idv}>; rel='canonical'"
else:
resp.headers["Link"] = f"<https://arxiv.org/pdf/{arxiv_id.id}>; rel='canonical'"
resp.headers["Link"] = f"<https://arxiv.org/pdf/{arxiv_id.id}>; rel='canonical'"
if arxiv_id.has_version:
resp.headers=add_surrogate_key(resp.headers,["pdf",f"pdf-{arxiv_id.idv}"])
else:
Expand Down Expand Up @@ -196,14 +193,14 @@ def _html_response(file_list: Union[List[FileObj],FileObj],
version: VersionEntry) -> Response:
if docmeta.source_format == 'html' or version.source_flag.html:
resp= _html_source_listing_response(file_list, arxiv_id)
elif isinstance(file_list, FileObj): #converted via latexml
elif issubclass(type(file_list), FileObj): #converted via latexml
resp= default_resp_fn(file_list, arxiv_id, docmeta, version)
resp.headers=add_surrogate_key(resp.headers,["html-latexml"])
else:
# Not a data error since a non-html-source paper might legitimately not have a latexml HTML
resp= unavailable(arxiv_id)

if arxiv_id.has_version:
if arxiv_id.has_version:
resp.headers=add_surrogate_key(resp.headers,["html",f"html-{arxiv_id.idv}"])
else:
resp.headers=add_surrogate_key(resp.headers,["html",f"html-{arxiv_id.id}-current"])
Expand All @@ -213,7 +210,9 @@ def _html_response(file_list: Union[List[FileObj],FileObj],
def _html_source_single_response(file: FileObj, arxiv_id: Identifier) -> Response:
    """Produce a `Response` for a single file of a paper with HTML source.

    Files recognised by `_is_html_name` are wrapped in a `FileTransform` using
    `post_process_html` and get a `Link` header pointing at the versionless
    canonical HTML URL. Any other file is passed straight to `default_resp_fn`
    and gets no canonical header.

    Args:
        file: The file to serve.
        arxiv_id: Identifier of the paper the file belongs to; only its
            versionless `id` is used for the canonical URL.

    Returns:
        The response built by `default_resp_fn`, with a `Link` header added
        for HTML files.
    """
    if not _is_html_name(file):
        # Non-HTML files are served untouched, without a canonical header.
        return default_resp_fn(file, arxiv_id)

    # HTML documents are post-processed on the way out.
    resp = default_resp_fn(FileTransform(file, post_process_html), arxiv_id)
    # The canonical URL uses the versionless id even when a specific version
    # was requested, so every version points at the same canonical address.
    resp.headers["Link"] = f"<https://arxiv.org/html/{arxiv_id.id}>; rel='canonical'"
    return resp

Expand Down
2 changes: 1 addition & 1 deletion tests/dissemination/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def test_pdf_headers(client_with_test_fs):
assert "pdf-cs/0011004v1" in head
assert "paper-id-cs/0011004" in head

assert rv.headers["Link"] == "<https://arxiv.org/pdf/cs/0011004v1>; rel='canonical'", "should not have version"
assert rv.headers["Link"] == "<https://arxiv.org/pdf/cs/0011004>; rel='canonical'", "should not have version"


def test_pdf_redirect(client_with_test_fs):
Expand Down
9 changes: 9 additions & 0 deletions tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ def test_html_paper(client_with_test_fs):
assert resp.status_code == 200
headers= resp.headers
assert "max-age=31536000" in resp.headers.get("Surrogate-Control")
assert resp.headers.get('Link','') == "<https://arxiv.org/html/2403.10561>; rel='canonical'", "versionless canonical header"

keys= " "+headers["Surrogate-Key"]+" "
expected_keys=["html", "paper-id-2403.10561", "paper-id-2403.10561-current", "html-native", "html-2403.10561-current"]
assert all(" "+item+" " in keys for item in expected_keys)
Expand All @@ -23,19 +25,24 @@ def test_html_paper(client_with_test_fs):

resp = client_with_test_fs.get("/html/2403.10561/")
assert resp.status_code == 200 and b"Human-Centric" in resp.data
assert resp.headers.get('Link','') == "<https://arxiv.org/html/2403.10561>; rel='canonical'", "versionless canonical header"

resp = client_with_test_fs.get("/html/2403.10561v1")
assert resp.status_code == 200 and b"Human-Centric" in resp.data
assert resp.headers.get('Link','') == "<https://arxiv.org/html/2403.10561>; rel='canonical'", "versionless canonical header"

resp = client_with_test_fs.get("/html/2403.10561v1/")
assert resp.status_code == 200 and b"Human-Centric" in resp.data
assert resp.headers.get('Link','') == "<https://arxiv.org/html/2403.10561>; rel='canonical'", "versionless canonical header"


def test_html_icon(client_with_test_fs):
    """Check that an HTML article and its icon asset are both served.

    The asset response must have the PNG content type and must not carry a
    canonical `Link` header.
    """
    page = client_with_test_fs.get("/html/2403.10561")
    assert page.status_code == 200
    assert b"Human-Centric" in page.data

    icon = client_with_test_fs.get("/html/2403.10561/icon.png")
    assert icon.status_code == 200
    assert icon.headers.get("Content-Type") == "image/png"
    assert "Link" not in icon.headers, "assets of html articles don't have a canonical header (yet?)"


def test_html_paper_multi_files(client_with_test_fs):
Expand Down Expand Up @@ -113,6 +120,7 @@ def test_html_headers(client_with_test_fs):
assert 'Content-Type' in resp.headers
content_type = resp.headers.get('Content-Type', '')
assert content_type== "text/html; charset=utf-8"
assert resp.headers.get('Link', '') == "<https://arxiv.org/html/2403.10561>; rel='canonical'", "versionless canonical header"

#Surrogate Keys
rv=client_with_test_fs.head("/html/2403.10561")
Expand All @@ -125,6 +133,7 @@ def test_html_headers(client_with_test_fs):
assert "html-latexml" not in head

rv=client_with_test_fs.head("/html/cs/9904010v1/graph1.gif")
assert not('Link' in rv.headers), "HTML assets do not have a canonical header (yet?)"
head=rv.headers["Surrogate-Key"]
assert " html " in " "+head+" "
assert "html-cs/9904010-current" not in head
Expand Down

0 comments on commit 0ddb436

Please sign in to comment.