From 85fa986336c3f9d3e21d0d7313aed8c9a252f015 Mon Sep 17 00:00:00 2001 From: Lukas Schwab Date: Sun, 15 Oct 2023 19:26:28 -0700 Subject: [PATCH 1/6] Improve logging: use built-in formatting --- arxiv/arxiv.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/arxiv/arxiv.py b/arxiv/arxiv.py index 3cc05ed..30bbaf2 100644 --- a/arxiv/arxiv.py +++ b/arxiv/arxiv.py @@ -577,12 +577,7 @@ def results(self, search: Search, offset: int = 0) -> Generator[Result, None, No first_page = True while offset < total_results: page_size = min(self.page_size, search.max_results - offset) - logger.info( - "Requesting {} results at offset {}".format( - page_size, - offset, - ) - ) + logger.info("Requesting %d results at offset %d", page_size, offset) page_url = self._format_url(search, offset, page_size) feed = self._parse_feed(page_url, first_page) if first_page: @@ -591,16 +586,16 @@ def results(self, search: Search, offset: int = 0) -> Generator[Result, None, No # bug is fixed, we can remove this conditional and always set # `total_results = min(...)`. if len(feed.entries) == 0: - logger.info("Got empty results; stopping generation") + logger.info("Got empty first page; stopping generation") total_results = 0 else: total_results = min( total_results, int(feed.feed.opensearch_totalresults) ) logger.info( - "Got first page; {} of {} results available".format( - total_results, search.max_results - ) + "Got first page: %d of %d total results", + total_results, + search.max_results, ) # Subsequent pages are not the first page. first_page = False @@ -610,8 +605,8 @@ def results(self, search: Search, offset: int = 0) -> Generator[Result, None, No for entry in feed.entries: try: yield Result._from_feed_entry(entry) - except Result.MissingFieldError: - logger.warning("Skipping partial result") + except Result.MissingFieldError as e: + logger.warning("Skipping partial result: %s", e) continue def _format_url(self, search: Search, start: int, page_size: int) -> str: @@ -661,14 +656,14 @@ def __try_parse_feed( since_last_request = datetime.now() - self._last_request_dt if since_last_request < required: to_sleep = (required - since_last_request).total_seconds() - logger.info("Sleeping for %f seconds", to_sleep) + logger.info("Sleeping: %f seconds", to_sleep) time.sleep(to_sleep) logger.info( - "Requesting page of results", + "Requesting page (try %d): %s", + retry, + url, extra={ - "url": url, "first_page": first_page, - "retry": retry, "last_err": last_err.message if last_err is not None else None, }, ) @@ -680,6 +675,7 @@ def __try_parse_feed( elif len(feed.entries) == 0 and not first_page: err = UnexpectedEmptyPageError(url, retry) if err is not None: + logger.debug("Got error (try %d): %s", retry, err) if retries_left > 0: return self.__try_parse_feed( url, From 86f632d601993be14bd1b57ab82ab96f2f083937 Mon Sep 17 00:00:00 2001 From: Lukas Schwab Date: Sun, 15 Oct 2023 16:43:33 -0700 Subject: [PATCH 2/6] Verbose logging from pytest: log_cli --- setup.cfg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.cfg b/setup.cfg index 77f75d0..fe8dac9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,3 +3,6 @@ description_file = README.md [tool:pytest] addopts = --verbose +log_cli = True +log_cli_level = INFO + From 179baa45e739d8b6aec6ffd06cf872667f5617d3 Mon Sep 17 00:00:00 2001 From: Lukas Schwab Date: Sun, 15 Oct 2023 19:41:44 -0700 Subject: [PATCH 3/6] Bodge: substitute -1 for float(inf) results --- arxiv/arxiv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arxiv/arxiv.py b/arxiv/arxiv.py index 30bbaf2..e2b30ce 100644 --- a/arxiv/arxiv.py +++ b/arxiv/arxiv.py @@ -595,7 +595,7 @@ def results(self, search: Search, offset: int = 0) -> Generator[Result, None, No logger.info( "Got first page: %d of %d total results", total_results, - search.max_results, + search.max_results if search.max_results != float("inf") else -1, ) # Subsequent pages are not the first page. first_page = False From 8b80def4d40d64dbfe5f56493013f99fbf9792c0 Mon Sep 17 00:00:00 2001 From: Lukas Schwab Date: Sun, 15 Oct 2023 19:45:23 -0700 Subject: [PATCH 4/6] Format --- arxiv/arxiv.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arxiv/arxiv.py b/arxiv/arxiv.py index e2b30ce..07ffd02 100644 --- a/arxiv/arxiv.py +++ b/arxiv/arxiv.py @@ -595,7 +595,9 @@ def results(self, search: Search, offset: int = 0) -> Generator[Result, None, No logger.info( "Got first page: %d of %d total results", total_results, - search.max_results if search.max_results != float("inf") else -1, + search.max_results + if search.max_results != float("inf") + else -1, ) # Subsequent pages are not the first page. first_page = False From 538198aba92fcc235576ca3ccf3fcd948e4d5980 Mon Sep 17 00:00:00 2001 From: Lukas Schwab Date: Sun, 15 Oct 2023 19:54:57 -0700 Subject: [PATCH 5/6] Test in CI: HTTP --- arxiv/arxiv.py | 2 +- tests/test_result.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arxiv/arxiv.py b/arxiv/arxiv.py index 07ffd02..d34cd1b 100644 --- a/arxiv/arxiv.py +++ b/arxiv/arxiv.py @@ -506,7 +506,7 @@ class Client(object): `Client.results`. """ - query_url_format = "https://export.arxiv.org/api/query?{}" + query_url_format = "http://export.arxiv.org/api/query?{}" """The arXiv query API endpoint format.""" page_size: int """Maximum number of results fetched in a single API request.""" diff --git a/tests/test_result.py b/tests/test_result.py index a26c7a0..365df12 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -43,7 +43,7 @@ def test_result_shape(self): def test_from_feed_entry(self): feed = arxiv.Client()._parse_feed( - "https://export.arxiv.org/api/query?search_query=testing" + "http://export.arxiv.org/api/query?search_query=testing" ) feed_entry = feed.entries[0] result = arxiv.Result._from_feed_entry(feed_entry) From a8cfebdfc80d70f10d61eb92e3d78a0ff21c3d01 Mon Sep 17 00:00:00 2001 From: Lukas Schwab Date: Sun, 15 Oct 2023 19:56:05 -0700 Subject: [PATCH 6/6] Revert "Test in CI: HTTP" This reverts commit 538198aba92fcc235576ca3ccf3fcd948e4d5980. --- arxiv/arxiv.py | 2 +- tests/test_result.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arxiv/arxiv.py b/arxiv/arxiv.py index d34cd1b..07ffd02 100644 --- a/arxiv/arxiv.py +++ b/arxiv/arxiv.py @@ -506,7 +506,7 @@ class Client(object): `Client.results`. """ - query_url_format = "http://export.arxiv.org/api/query?{}" + query_url_format = "https://export.arxiv.org/api/query?{}" """The arXiv query API endpoint format.""" page_size: int """Maximum number of results fetched in a single API request.""" diff --git a/tests/test_result.py b/tests/test_result.py index 365df12..a26c7a0 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -43,7 +43,7 @@ def test_result_shape(self): def test_from_feed_entry(self): feed = arxiv.Client()._parse_feed( - "http://export.arxiv.org/api/query?search_query=testing" + "https://export.arxiv.org/api/query?search_query=testing" ) feed_entry = feed.entries[0] result = arxiv.Result._from_feed_entry(feed_entry)