From ffa6a8e2e43300ccb26eebb46aeb798ff2af2f8f Mon Sep 17 00:00:00 2001
From: Stijn
Date: Mon, 16 May 2022 17:40:16 +0200
Subject: [PATCH] Properly encode characters used in URL for arxiv and
 semanticscholar backends

---
 litstudy/sources/arxiv.py           | 17 +++++++++--------
 litstudy/sources/semanticscholar.py |  6 +++---
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/litstudy/sources/arxiv.py b/litstudy/sources/arxiv.py
index 1e01aa2..a569eb4 100644
--- a/litstudy/sources/arxiv.py
+++ b/litstudy/sources/arxiv.py
@@ -2,6 +2,7 @@
 from typing import Optional, List
 import feedparser  # type: ignore
 from datetime import datetime
+from urllib.parse import urlencode
 import time
 
 
@@ -60,6 +61,8 @@ def category(self) -> Optional[List[str]]:
         '''returns arxiv category for article'''
         return self.entry.get('tags', None)[0].get('term', None)
 
+# Base api query url
+ARXIV_SEARCH_URL = 'http://export.arxiv.org/api/query'
 
 def search_arxiv(search_query,
                  start=0,
@@ -89,16 +92,14 @@ def search_arxiv(search_query,
 
     docs = list()
 
-    # Base api query url
-    base_url = 'http://export.arxiv.org/api/query?'
-
-    print(f'Searching arXiv for {search_query}')
-
     for i in range(start, total_results, results_per_iteration):
-        query = (f'search_query={search_query}&start={i}&max_results='
-                 f'{results_per_iteration}')
+        query = urlencode(dict(
+            search_query=search_query,
+            start=i,
+            max_results=results_per_iteration
+        ))
 
-        url = base_url + query
+        url = f'{ARXIV_SEARCH_URL}?{query}'
         data = feedparser.parse(url)
 
         for entry in data.entries:
diff --git a/litstudy/sources/semanticscholar.py b/litstudy/sources/semanticscholar.py
index 287519f..2aa8974 100644
--- a/litstudy/sources/semanticscholar.py
+++ b/litstudy/sources/semanticscholar.py
@@ -1,6 +1,6 @@
 from time import sleep
 from typing import Tuple, Optional
-from urllib.parse import quote_plus
+from urllib.parse import urlencode
 import logging
 import requests
 import shelve
@@ -115,11 +115,11 @@ def request_results(query, offset, cache, timeout=DEFAULT_TIMEOUT):
 
 
 def request_paper(key, cache, timeout=DEFAULT_TIMEOUT):
-    cache_key = f'paper={key}'
+    cache_key = urlencode(dict(paper=key))
     if cache_key in cache:
         return cache[cache_key]
 
-    url = S2_PAPER_URL + quote_plus(key)
+    url = S2_PAPER_URL + cache_key
 
     try:
         sleep(timeout)
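
Note, not part of the patch itself: urlencode() percent-escapes every
parameter value, whereas the old f-string interpolation in the arXiv backend
let characters such as spaces, quotes, ':' and '&' pass through unescaped;
the Semantic Scholar backend is switched to the same helper so both backends
build URLs the same way. A minimal standalone sketch of the difference (the
search string below is made up for illustration):

    from urllib.parse import urlencode

    # A query containing characters that are special inside URLs.
    search_query = 'all:"graph neural networks" AND cat:cs.LG'

    # Old style: raw interpolation leaves spaces, quotes and ':' unescaped.
    unsafe = f'search_query={search_query}&start=0&max_results=100'

    # New style: urlencode percent-escapes each value before joining.
    safe = urlencode(dict(search_query=search_query, start=0, max_results=100))

    print(unsafe)
    # search_query=all:"graph neural networks" AND cat:cs.LG&start=0&max_results=100
    print(safe)
    # search_query=all%3A%22graph+neural+networks%22+AND+cat%3Acs.LG&start=0&max_results=100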