Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added lxml_html_clean and updated pdfplumber dependencies #143

Merged
merged 3 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ reMarkable from any of the following sources:
* [ACL Web](https://www.aclweb.org/anthology/)
* [ACM Digital Library](https://dl.acm.org/dl.cfm)
* [CVF](https://openaccess.thecvf.com/menu)
* [DiVA](https://diva-portal.org/)
* [ECCC](https://eccc.weizmann.ac.il/reports/menu/)
* [IACR](https://eprint.iacr.org/)
* [JMLR](http://jmlr.org)
Expand Down
15 changes: 15 additions & 0 deletions paper2remarkable/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,18 @@ def __str__(self):
"as paper2remarkable gets blocked by CloudFlare.\n"
)
return msg

class FulltextMissingError(Error):
    """Exception raised when the fulltext PDF can't be found.

    Attributes
    ----------
    provider :
        Value shown on the first indented line of the error message
        (the provider name passed by the caller).
    url :
        The url for which no fulltext PDF could be located.
    """

    def __init__(self, provider, url):
        self.url = url
        self.provider = provider

    def __str__(self):
        # The message is built from adjacent literals so that the provider
        # and url each appear on their own tab-indented line.
        return (
            "ERROR: Couldn't find the fulltext PDF for the following url:\n"
            f"\t{self.provider}\n"
            f"\t{self.url}\n"
        )
2 changes: 2 additions & 0 deletions paper2remarkable/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .arxiv import Arxiv
from .citeseerx import CiteSeerX # disabled, incomplete html doc received
from .cvf import CVF
from .diva import DiVA
from .eccc import ECCC
from .html import HTML
from .iacr import IACR
Expand Down Expand Up @@ -33,6 +34,7 @@
ACM,
Arxiv,
CVF,
DiVA,
ECCC,
IACR,
JMLR,
Expand Down
76 changes: 76 additions & 0 deletions paper2remarkable/providers/diva.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-

"""Provider for DiVA - Digitala Vetenskapliga Arkivet

Author: G.J.J. van den Burg, Johan Holmberg
License: See LICENSE file
Copyright: 2019, 2024, G.J.J. van den Burg, Johan Holmberg

"""

import os
import re
import urllib.parse

import bs4

from ..exceptions import URLResolutionError, FulltextMissingError
from ..log import Logger
from ..utils import get_page_with_retry
from ._base import Provider
from ._info import Informer

logger = Logger()


class DiVAInformer(Informer):
    """Informer that reads the publication year from a DiVA record page."""

    def get_year(self, soup):
        """Return the content of the ``citation_publication_date`` meta tag.

        Parameters
        ----------
        soup :
            Parsed record page; only ``soup.find`` is used on it.

        Returns
        -------
        str
            The ``content`` attribute of the meta tag, or an empty string
            (after logging a warning) when the tag or its content is missing.
        """
        meta = soup.find("meta", {"name": "citation_publication_date"})
        # find() returns None when the tag is absent; guard against that so
        # we fall through to the warning instead of raising AttributeError.
        year = meta.get("content") if meta is not None else None
        if not year:
            logger.warning(
                "Couldn't determine year information, maybe provide the desired filename using '--filename'?"
            )
            return ""
        return year


class DiVA(Provider):
    """Provider for records hosted on ``*.diva-portal.org``.

    Two url shapes are recognised: the record ("abstract") page matched by
    ``re_abs`` and the direct fulltext download matched by ``re_pdf``.
    """

    # Raw strings with escaped dots, so "." only matches a literal dot in the
    # host name and in "record.jsf" rather than any character.
    re_abs = r"^https?://[a-z]+\.diva-portal\.org/smash/record\.jsf"
    re_pdf = r"^https?://[a-z]+\.diva-portal\.org/smash/get/diva2:[0-9]+/FULLTEXT"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.informer = DiVAInformer()

    def _get_doc_url(self, abs_url):
        """Return the fulltext url advertised by the record page.

        The url is read from the ``citation_pdf_url`` meta tag of the page
        at ``abs_url``.

        Raises
        ------
        FulltextMissingError
            If the meta tag is absent or carries no content.
        """
        page = get_page_with_retry(abs_url)
        soup = bs4.BeautifulSoup(page, "html.parser")

        meta = soup.find("meta", {"name": "citation_pdf_url"})
        # Treat a tag without content the same as a missing tag, so callers
        # never receive None as the pdf url.
        pdf_url = meta.get("content") if meta is not None else None
        if not pdf_url:
            logger.warning("Couldn't find the fulltext URL")
            raise FulltextMissingError("DiVA", abs_url)

        return pdf_url

    def _get_abs_url(self, pdf_url):
        """Build the record page url from a fulltext download url.

        Raises
        ------
        URLResolutionError
            If ``pdf_url`` lacks the portal prefix or the ``diva2:<id>`` part.
        """
        prefix = re.search(
            r"https?://[a-z]+\.diva-portal\.org/smash/", pdf_url
        )
        diva_id = re.search(r"diva2:([0-9]+)", pdf_url)
        if prefix is None or diva_id is None:
            raise URLResolutionError("DiVA", pdf_url)
        # The record page expects the colon of "diva2:<id>" percent-encoded.
        return prefix.group(0) + "record.jsf?pid=diva2%3A" + diva_id.group(1)

    def get_abs_pdf_urls(self, url):
        """Return ``(abs_url, pdf_url)`` for a record or fulltext url.

        Raises
        ------
        URLResolutionError
            If ``url`` matches neither ``re_abs`` nor ``re_pdf``.
        """
        if re.match(self.re_abs, url):
            abs_url = url
            pdf_url = self._get_doc_url(url)
        elif re.match(self.re_pdf, url):
            abs_url = self._get_abs_url(url)
            pdf_url = url
        else:
            raise URLResolutionError("DiVA", url)
        return abs_url, pdf_url

    @staticmethod
    def validate(src):
        """Return a truthy match object when ``src`` looks like a DiVA url."""
        # Declared static so it works both as DiVA.validate(src) and when
        # called on an instance.
        return re.match(DiVA.re_abs, src) or re.match(DiVA.re_pdf, src)
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@
REQUIRED = [
"beautifulsoup4>=4.8",
"html2text>=2020.1.16",
"lxml_html_clean>=0.1.1",
"markdown>=3.1.1",
"pdfplumber>=0.5",
"pdfplumber>=0.11",
"pikepdf>=2.9.0",
"pycryptodome",
"pyyaml>=5.1",
Expand Down
24 changes: 23 additions & 1 deletion tests/test_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@
from _constants import TEST_FILE
from pikepdf import Pdf

from paper2remarkable.exceptions import URLResolutionError
from paper2remarkable.exceptions import URLResolutionError, FulltextMissingError
from paper2remarkable.providers import ACL
from paper2remarkable.providers import ACM
from paper2remarkable.providers import CVF
from paper2remarkable.providers import DiVA
from paper2remarkable.providers import ECCC
from paper2remarkable.providers import HTML
from paper2remarkable.providers import IACR
Expand Down Expand Up @@ -552,6 +553,27 @@ def test_iacr_3(self):
filename = prov.run(url)
self.assertEqual(exp, os.path.basename(filename))

def test_diva_1(self):
    # Testing redirections from Kungliga biblioteket: the input is a
    # urn.kb.se resolver url rather than a diva-portal.org url.
    provider = DiVA(upload=False, verbose=VERBOSE)
    source_url = "https://urn.kb.se/resolve?urn=urn:nbn:se:uu:diva-318796"
    expected_name = "Lidayova_-_Fast_Methods_for_Vascular_Segmentation_Based_on_Approximate_Skeleton_Detection_2017.pdf"
    produced_path = provider.run(source_url)
    self.assertEqual(expected_name, os.path.basename(produced_path))

def test_diva_2(self):
    # Testing absolute URLs and sanitization of filenames: the expected
    # name contains "Amp" where the title presumably has an ampersand.
    provider = DiVA(upload=False, verbose=VERBOSE)
    source_url = "https://www.diva-portal.org/smash/record.jsf?pid=diva2%3A1480467"
    expected_name = "Alhussein_-_Privacy_by_Design_Amp_Internet_of_Things_Managing_Privacy_2018.pdf"
    produced_path = provider.run(source_url)
    self.assertEqual(expected_name, os.path.basename(produced_path))

def test_diva_3(self):
    # Testing older entries without available fulltext: running the
    # provider on such a record must raise FulltextMissingError.
    provider = DiVA(upload=False, verbose=VERBOSE)
    source_url = "https://uu.diva-portal.org/smash/record.jsf?pid=diva2%3A59234"
    with self.assertRaises(FulltextMissingError):
        provider.run(source_url)

if __name__ == "__main__":
unittest.main()
Loading