From 9f1283f263f8a968e2989fc8dc884118c0f94a3e Mon Sep 17 00:00:00 2001 From: felixleo22 Date: Fri, 8 Nov 2024 14:44:25 +0100 Subject: [PATCH 1/3] feat: make max-attempts configurable and make it available with header --- crossref/README.md | 2 +- datacite/README.md | 4 ++- datacite/index.js | 4 ++- eprints/README.md | 2 ++ eprints/index.js | 14 ++++---- ezunpaywall/README.md | 3 +- ezunpaywall/index.js | 6 ++-- hal/README.md | 5 +-- hal/index.js | 52 +++++++++++++++------------- istex/README.md | 3 +- istex/index.js | 80 +++++++++++++++++++++++-------------------- oej/bulk.js | 9 ++--- oej/index.js | 7 ++-- panist/README.md | 3 +- panist/index.js | 40 +++++++++++++--------- sudoc/README.md | 1 + sudoc/index.js | 19 ++++++---- unpaywall/README.md | 3 +- zotero/README.md | 3 +- zotero/index.js | 19 +++++----- 20 files changed, 161 insertions(+), 118 deletions(-) diff --git a/crossref/README.md b/crossref/README.md index 9203135..87b2e8d 100644 --- a/crossref/README.md +++ b/crossref/README.md @@ -38,7 +38,7 @@ You can use ezunpaywall with crossreft by placing it in front. This will save yo + **crossref-ttl** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)``. + **crossref-throttle** : Minimum time to wait between queries, in milliseconds. Defaults to ``200``ms. + **crossref-paquet-size** : Maximum number of identifiers to send for query in a single request. Defaults to ``50``. -+ **crossref-buffer-size** : Maximum number of memorised access events before sending a request. Defaults to ``1000``. ++ **crossref-buffer-size** : Maximum number of memorized access events before sending a request. Defaults to ``1000``. + **crossref-max-tries** : Maximum number of attempts if an enrichment fails. Defaults to ``5``. + **crossref-on-fail** : Strategy to adopt if an enrichment reaches the maximum number of attempts. Can be either of ``abort``, ``ignore`` or ``retry``. Defaults to ``abort``. 
+ **crossref-base-wait-time** : Time to wait before retrying after a query fails, in milliseconds. Defaults to ``1000``ms. This time ``doubles`` after each attempt. diff --git a/datacite/README.md b/datacite/README.md index 652433d..4f58fee 100644 --- a/datacite/README.md +++ b/datacite/README.md @@ -22,7 +22,9 @@ Your EC needs a DOI for enrichment. + **datacite-ttl** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)``. + **datacite-throttle** : Minimum time to wait between queries, in milliseconds. Defaults to ``100``ms. + **datacite-paquet-size** : Maximum number of identifiers to send for query in a single request. Defaults to ``50``. -+ **datacite-buffer-size** : Maximum number of memorised access events before sending a request. Defaults to ``1000``. ++ **datacite-buffer-size** : Maximum number of memorized access events before sending a request. Defaults to ``1000``. ++ **datacite-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``. 
+ ## How to use diff --git a/datacite/index.js b/datacite/index.js index bbb8902..354c146 100644 --- a/datacite/index.js +++ b/datacite/index.js @@ -24,11 +24,14 @@ module.exports = function () { let packetSize = parseInt(req.header('datacite-packet-size')); // Minimum number of ECs to keep before resolving them let bufferSize = parseInt(req.header('datacite-buffer-size')); + // Maximum number of trials before passing the EC in error + let maxAttempts = parseInt(req.header('datacite-max-attempts')); if (isNaN(packetSize)) { packetSize = 10; } if (isNaN(bufferSize)) { bufferSize = 1000; } if (isNaN(throttle)) { throttle = 100; } if (isNaN(ttl)) { ttl = 3600 * 24 * 7; } + if (isNaN(maxAttempts)) { maxAttempts = 5; } if (!cache) { const err = new Error('failed to connect to mongodb, cache not available for Datacite'); @@ -87,7 +90,6 @@ module.exports = function () { const dois = ecs.map(([ec, done]) => ec.doi); - const maxAttempts = 5; let tries = 0; let docs; diff --git a/eprints/README.md b/eprints/README.md index 8d74e06..5a2da9a 100644 --- a/eprints/README.md +++ b/eprints/README.md @@ -28,6 +28,8 @@ Your EC needs a domain belonging to an eprint platform and a eprints ID. + **eprints-packet-size** : Maximum number of article to query + **eprints-buffer-size** : Minimum number of ECs to keep before resolving them + **eprints-domain-name** : Domain name eprints platform ++ **eprints-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``. 
+ ## How to use diff --git a/eprints/index.js b/eprints/index.js index 02d4a01..e839bcd 100644 --- a/eprints/index.js +++ b/eprints/index.js @@ -22,7 +22,7 @@ module.exports = function eprints() { const logger = this.logger; const report = this.report; - const req = this.request; + const req = this.request; const cacheEnabled = !/^false$/i.test(req.header('eprints-cache')); @@ -35,6 +35,8 @@ module.exports = function eprints() { let packetSize = parseInt(req.header('eprints-packet-size')); // Minimum number of ECs to keep before resolving them let bufferSize = parseInt(req.header('eprints-buffer-size')); + // Maximum number of trials before passing the EC in error + let maxAttempts = parseInt(req.header('eprints-max-attempts')); // Domain name eprints platform const domainName = req.header('eprints-domain-name'); @@ -42,6 +44,7 @@ module.exports = function eprints() { if (isNaN(bufferSize)) { bufferSize = 200; } if (isNaN(throttle)) { throttle = 100; } if (isNaN(ttl)) { ttl = 3600 * 24 * 7; } + if (isNaN(maxAttempts)) { maxAttempts = 5; } if (!cache) { const err = new Error('failed to connect to mongodb, cache not available for eprints'); @@ -143,8 +146,7 @@ module.exports = function eprints() { * @param {Object} ec the EC to process * @param {Function} done the callback */ - function* processEc (ec, done) { - const maxAttempts = 5; + function* processEc(ec, done) { let tries = 0; let result; @@ -180,7 +182,7 @@ module.exports = function eprints() { * Request metadata from OAI-PMH API for a given ID * @param {String} id the id to query */ - function query (id) { + function query(id) { report.inc('general', 'eprints-queries'); return new Promise((resolve, reject) => { const hostname = new URL(domainName).hostname; @@ -238,9 +240,9 @@ module.exports = function eprints() { * @param {Object} res the result to verify */ function verifFields(res) { - resultFields.forEach(function(field) { + resultFields.forEach(function (field) { if (res.hasOwnProperty(field)) { - res 
=res[field][0]; + res = res[field][0]; } else { return false; } diff --git a/ezunpaywall/README.md b/ezunpaywall/README.md index dd55071..f208071 100644 --- a/ezunpaywall/README.md +++ b/ezunpaywall/README.md @@ -28,7 +28,8 @@ You need an API key to use this service. You can use the **demo** apikey but it' + **ezunpaywall-ttl** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)``. + **ezunpaywall-throttle** : Minimum time to wait between queries, in milliseconds. Defaults to ``100``ms. + **ezunpaywall-paquet-size** : Maximum number of identifiers to send for query in a single request. Defaults to ``100``. -+ **ezunpaywall-buffer-size** : Maximum number of memorised access events before sending a request. Defaults to ``1000``. ++ **ezunpaywall-buffer-size** : Maximum number of memorized access events before sending a request. Defaults to ``1000``. ++ **ezunpaywall-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``. + **ezunpaywall-api-key** : apikey to use ezunpaywall. 
## How to use diff --git a/ezunpaywall/index.js b/ezunpaywall/index.js index 01224a0..ff1b5a8 100644 --- a/ezunpaywall/index.js +++ b/ezunpaywall/index.js @@ -24,7 +24,7 @@ module.exports = function () { const logger = this.logger; const report = this.report; - const req = this.request; + const req = this.request; const cacheEnabled = !/^false$/i.test(req.header('ezunpaywall-cache')); @@ -38,11 +38,14 @@ module.exports = function () { let packetSize = parseInt(req.header('ezunpaywall-packet-size')); // Minimum number of ECs to keep before resolving them let bufferSize = parseInt(req.header('ezunpaywall-buffer-size')); + // Maximum number of trials before passing the EC in error + let maxAttempts = parseInt(req.header('ezunpaywall-max-attempts')); if (isNaN(packetSize)) { packetSize = 100; } if (isNaN(bufferSize)) { bufferSize = 1000; } if (isNaN(throttle)) { throttle = 100; } if (isNaN(ttl)) { ttl = 3600 * 24 * 7; } + if (isNaN(maxAttempts)) { maxAttempts = 5; } if (!cache) { const err = new Error('failed to connect to mongodb, cache not available for ezunpaywall'); @@ -107,7 +110,6 @@ module.exports = function () { const dois = ecs.map(([ec, done]) => ec.doi); - const maxAttempts = 5; let tries = 0; let docs; diff --git a/hal/README.md b/hal/README.md index b369b90..8dd576f 100644 --- a/hal/README.md +++ b/hal/README.md @@ -32,8 +32,9 @@ The HAL middleware uses the ``hal-identifier`` found in the access events to req hal-cache + **hal-ttl** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)``. + **hal-throttle** : Minimum time to wait between queries, in milliseconds. Defaults to ``500``. -+ **hal-paquet-size** : -+ **hal-buffer-siz** : ++ **hal-paquet-size** : Maximum number of identifiers to send for query in a single request. Defaults to ``100``. ++ **hal-buffer-size** : Maximum number of memorized access events before sending a request. 
Defaults to ``1000` ++ **hal-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``. ## How to use diff --git a/hal/index.js b/hal/index.js index 079c5d4..8962fdf 100644 --- a/hal/index.js +++ b/hal/index.js @@ -19,26 +19,32 @@ try { * Enrich ECs with hal data */ module.exports = function () { - const self = this; - const report = this.report; - const req = this.request; - const activated = (methal !== null) && /^true$/i.test(req.header('hal-enrich')); + const self = this; + const report = this.report; + const req = this.request; + const activated = (methal !== null) && /^true$/i.test(req.header('hal-enrich')); const cacheEnabled = !/^false$/i.test(req.header('hal-cache')); if (!activated) { return function (ec, next) { next(); }; } self.logger.verbose('hal cache: %s', cacheEnabled ? 'enabled' : 'disabled'); - const ttl = parseInt(req.header('hal-ttl')) || 3600 * 24 * 7; - const throttle = parseInt(req.header('hal-throttle')) || 100; - const packetSize = parseInt(req.header('hal-paquet-size')) || 150; - const maxAttempts = 5; + // Time-to-live of cached documents + let ttl = parseInt(req.header('hal-ttl')); + // Minimum wait time before each request (in ms) + let throttle = parseInt(req.header('hal-throttle')); + // Maximum number of ID to query + let packetSize = parseInt(req.header('hal-paquet-size')); // Minimum number of ECs to keep before resolving them - let bufferSize = parseInt(req.header('hal-buffer-size')); + let bufferSize = parseInt(req.header('hal-buffer-size')); + // Maximum number of trials before passing the EC in error + let maxAttempts = parseInt(req.header('hal-max-attempts')); - if (isNaN(bufferSize)) { - bufferSize = 1000; - } + if (isNaN(packetSize)) { packetSize = 150; } + if (isNaN(bufferSize)) { bufferSize = 1000; } + if (isNaN(throttle)) { throttle = 100; } + if (isNaN(ttl)) { ttl = 3600 * 24 * 7; } + if (isNaN(maxAttempts)) { maxAttempts = 5; } const buffer = []; let busy = false; @@ -309,24 +315,24 @@ 
module.exports = function () { let sidDepot = null; if (doc) { - ec['hal_docid'] = doc.docid; - ec['hal_identifiant'] = doc.halId_s; + ec['hal_docid'] = doc.docid; + ec['hal_identifiant'] = doc.halId_s; ec['publication_title'] = (doc.title_s || [''])[0]; - ec['hal_tampons'] = (doc.collId_i || []).join(','); - ec['hal_tampons_name'] = (doc.collCode_s || []).join(','); - ec['hal_domains'] = (doc.domain_s || []).join(','); + ec['hal_tampons'] = (doc.collId_i || []).join(','); + ec['hal_tampons_name'] = (doc.collCode_s || []).join(','); + ec['hal_domains'] = (doc.domain_s || []).join(','); sidDepot = doc.sid_i; // Formatage du document à mettre en cache cacheDoc = []; - cacheDoc['hal_docid'] = ec.hal_docid; - cacheDoc['hal_identifiant'] = ec.hal_identifiant; + cacheDoc['hal_docid'] = ec.hal_docid; + cacheDoc['hal_identifiant'] = ec.hal_identifiant; cacheDoc['publication_title'] = ec.publication_title; - cacheDoc['hal_tampons'] = ec.hal_tampons; - cacheDoc['hal_tampons_name'] = ec.hal_tampons_name; - cacheDoc['hal_domains'] = ec.hal_domains; - cacheDoc['hal_sid'] = sidDepot; + cacheDoc['hal_tampons'] = ec.hal_tampons; + cacheDoc['hal_tampons_name'] = ec.hal_tampons_name; + cacheDoc['hal_domains'] = ec.hal_domains; + cacheDoc['hal_sid'] = sidDepot; } let idTocache = identifiantOriginel || ec.hal_identifiant; diff --git a/istex/README.md b/istex/README.md index 27a4b22..7039fa7 100644 --- a/istex/README.md +++ b/istex/README.md @@ -32,7 +32,8 @@ The ISTEX middleware uses the ``istex-identifier`` found in the access events to + **istex-ttl** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)``. + **istex-throttle** : Minimum time to wait between queries, in milliseconds. Defaults to ``500``. + **istex-paquet-size** : Maximum number of identifiers to send for query in a single request. Defaults to ``50``. -+ **istex-buffer-size** : Maximum number of memorised access events before sending a request. Defaults to ``1000``. 
++ **istex-buffer-size** : Maximum number of memorized access events before sending a request. Defaults to ``1000``. ++ **istex-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``. ## How to use diff --git a/istex/index.js b/istex/index.js index 0852050..1839221 100644 --- a/istex/index.js +++ b/istex/index.js @@ -1,8 +1,8 @@ 'use strict'; -const istex = require('node-istex').defaults({ extraQueryString: { sid: 'ezpaarse' }}); -const co = require('co'); -const data = require('./istex-rtype.json'); // matching between ezPAARSE and Istex types +const istex = require('node-istex').defaults({ extraQueryString: { sid: 'ezpaarse' } }); +const co = require('co'); +const data = require('./istex-rtype.json'); // matching between ezPAARSE and Istex types const cache = ezpaarse.lib('cache')('istex'); const tiffCorpus = new Set(['EEBO', 'ECCO']); @@ -11,21 +11,26 @@ const tiffCorpus = new Set(['EEBO', 'ECCO']); * Enrich ECs with istex data */ module.exports = function () { - const self = this; - const report = this.report; - const req = this.request; - const activated = /^true$/i.test(req.header('istex-enrich')); + const self = this; + const report = this.report; + const req = this.request; + const activated = /^true$/i.test(req.header('istex-enrich')); const cacheEnabled = !/^false$/i.test(req.header('istex-cache')); if (!activated) { return function (ec, next) { next(); }; } self.logger.verbose('Istex cache: %s', cacheEnabled ? 
'enabled' : 'disabled'); - const ttl = parseInt(req.header('istex-ttl')) || 3600 * 24 * 7; - const throttle = parseInt(req.header('istex-throttle')) || 100; + // Time-to-live of cached documents + const ttl = parseInt(req.header('istex-ttl')) || 3600 * 24 * 7; + // Minimum wait time before each request (in ms) + const throttle = parseInt(req.header('istex-throttle')) || 100; + // Maximum number of ID to query const packetSize = parseInt(req.header('istex-paquet-size')) || 150; // Minimum number of ECs to keep before resolving them - let bufferSize = parseInt(req.header('istex-buffer-size')); + let bufferSize = parseInt(req.header('istex-buffer-size')); + // Maximum number of trials before passing the EC in error + let maxAttempts = parseInt(req.header('istex-max-attempts')); if (isNaN(bufferSize)) { bufferSize = 1000; @@ -153,7 +158,6 @@ module.exports = function () { continue; } - const maxAttempts = 5; const results = new Map(); let tries = 0; let list; @@ -221,8 +225,8 @@ module.exports = function () { report.inc('general', 'istex-queries'); const subQueries = []; - const istexIds = []; - const arks = []; + const istexIds = []; + const arks = []; ids.forEach(id => { /^ark:/i.test(id) ? 
arks.push(id) : istexIds.push(id); @@ -263,12 +267,12 @@ module.exports = function () { // We only cache what we need to limit memory usage const cached = { publicationDate: item.publicationDate, - copyrightDate: item.copyrightDate, - corpusName: item.corpusName, - language: item.language, - genre: item.genre, - host: item.host, - doi: item.doi + copyrightDate: item.copyrightDate, + corpusName: item.corpusName, + language: item.language, + genre: item.genre, + host: item.host, + doi: item.doi }; cache.set(id, cached, (err, result) => { @@ -302,32 +306,32 @@ module.exports = function () { } if (host) { - if (host.isbn) { ec['print_identifier'] = getValue(host.isbn); } - if (host.issn) { ec['print_identifier'] = getValue(host.issn); } - if (host.eisbn) { ec['online_identifier'] = getValue(host.eisbn); } - if (host.eissn) { ec['online_identifier'] = getValue(host.eissn); } - if (host.title) { ec['publication_title'] = getValue(host.title); } + if (host.isbn) { ec['print_identifier'] = getValue(host.isbn); } + if (host.issn) { ec['print_identifier'] = getValue(host.issn); } + if (host.eisbn) { ec['online_identifier'] = getValue(host.eisbn); } + if (host.eissn) { ec['online_identifier'] = getValue(host.eissn); } + if (host.title) { ec['publication_title'] = getValue(host.title); } if (host.subject && host.subject.value) { ec['subject'] = getValue(host.subject).value; } } ec['publication_date'] = publicationDate || copyrightDate; - if (doi) { ec['doi'] = getValue(doi); } - if (arkIstex) { ec['ark'] = getValue(arkIstex); } - if (genre) { ec['istex_genre'] = getValue(genre); } - if (language) { ec['language'] = getValue(language); } + if (doi) { ec['doi'] = getValue(doi); } + if (arkIstex) { ec['ark'] = getValue(arkIstex); } + if (genre) { ec['istex_genre'] = getValue(genre); } + if (language) { ec['language'] = getValue(language); } switch (ec['istex_rtype']) { - case 'fulltext': - ec['rtype'] = data[genre] || 'MISC'; - break; - case 'metadata': - case 'enrichments': - 
case 'record': - ec['rtype'] = 'METADATA'; - break; - default: - ec['rtype'] = 'MISC'; + case 'fulltext': + ec['rtype'] = data[genre] || 'MISC'; + break; + case 'metadata': + case 'enrichments': + case 'record': + ec['rtype'] = 'METADATA'; + break; + default: + ec['rtype'] = 'MISC'; } } }; diff --git a/oej/bulk.js b/oej/bulk.js index 5c01679..dd653e1 100644 --- a/oej/bulk.js +++ b/oej/bulk.js @@ -8,9 +8,9 @@ const cache = ezpaarse.lib('cache')('oej'); module.exports = function () { this.logger.verbose('Initializing OEJ'); - const logger = this.logger; - const report = this.report; - const req = this.request; + const logger = this.logger; + const report = this.report; + const req = this.request; const cacheEnabled = !/^false$/i.test(req.header('oej-cache')); logger.verbose(`OEJ cache: ${cacheEnabled ? 'enabled' : 'disabled'}`); @@ -23,6 +23,8 @@ module.exports = function () { let packetSize = parseInt(req.header('oej-paquet-size')); // Minimum number of ECs to keep before resolving them let bufferSize = parseInt(req.header('oej-buffer-size')); + // Maximum number of trials before passing the EC in error + let maxAttempts = parseInt(req.header('oej-max-attempts')); if (isNaN(bufferSize)) { bufferSize = 1000; } if (isNaN(packetSize)) { packetSize = 150; } @@ -89,7 +91,6 @@ module.exports = function () { function* onPacket({ ecs, groups }) { const ids = Array.from(groups.keys()); // Set of different Sitename/LodelID couples - const maxAttempts = 5; const results = new Map(); let tries = 0; let list; diff --git a/oej/index.js b/oej/index.js index 375324f..4599941 100644 --- a/oej/index.js +++ b/oej/index.js @@ -10,7 +10,7 @@ module.exports = function () { const logger = this.logger; const report = this.report; - const req = this.request; + const req = this.request; const cacheEnabled = !/^false$/i.test(req.header('oej-cache')); @@ -24,6 +24,8 @@ module.exports = function () { let packetSize = parseInt(req.header('oej-paquet-size')); // Minimum number of ECs to keep 
before resolving them let bufferSize = parseInt(req.header('oej-buffer-size')); + // Maximum number of trials before passing the EC in error + let maxAttempts = parseInt(req.header('oej-max-attempts')); if (isNaN(bufferSize)) { bufferSize = 1000; } if (isNaN(packetSize)) { packetSize = 150; } @@ -88,10 +90,9 @@ module.exports = function () { for (const [ec, done] of ecs) { report.inc('general', 'oej-accesses'); - const lodelid = ec.lodelid; + const lodelid = ec.lodelid; const sitename = ec.title_id; - const maxAttempts = 5; let tries = 0; let doc; diff --git a/panist/README.md b/panist/README.md index 84dbbad..e5fd080 100644 --- a/panist/README.md +++ b/panist/README.md @@ -30,7 +30,8 @@ The ISTEX middleware uses the ``istex-identifier`` found in the access events to + **panist-ttl** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)``. + **panist-throttle** : Minimum time to wait between queries, in milliseconds. Defaults to ``200``ms. + **panist-paquet-size** : Maximum number of identifiers to send for query in a single request. Defaults to ``50``. -+ **panist-buffer-size** : Maximum number of memorised access events before sending a request. Defaults to ``1000``. ++ **panist-buffer-size** : Maximum number of memorized access events before sending a request. Defaults to ``1000``. ++ **panist-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``. ## How to use diff --git a/panist/index.js b/panist/index.js index 143de13..0e83a10 100644 --- a/panist/index.js +++ b/panist/index.js @@ -32,15 +32,22 @@ module.exports = function () { self.logger.verbose('Panist cache: %s', cacheEnabled ? 
'enabled' : 'disabled'); - const ttl = parseInt(req.header('panist-ttl')) || 3600 * 24 * 7; - const throttle = parseInt(req.header('panist-throttle')) || 100; - const packetSize = parseInt(req.header('panist-paquet-size')) || 150; + // Time-to-live of cached documents + let ttl = parseInt(req.header('panist-ttl')); + // Minimum wait time before each request (in ms) + let throttle = parseInt(req.header('panist-throttle')); + // Maximum number of ID to query + let packetSize = parseInt(req.header('panist-paquet-size')); // Minimum number of ECs to keep before resolving them let bufferSize = parseInt(req.header('panist-buffer-size')); + // Maximum number of trials before passing the EC in error + let maxAttempts = parseInt(req.header('panist-max-attempts')); - if (isNaN(bufferSize)) { - bufferSize = 1000; - } + if (isNaN(bufferSize)) { bufferSize = 1000; } + if (isNaN(packetSize)) { packetSize = 150; } + if (isNaN(throttle)) { throttle = 100; } + if (isNaN(ttl)) { ttl = 3600 * 24 * 7; } + if (isNaN(maxAttempts)) { maxAttempts = 5; } const buffer = []; let busy = false; @@ -164,7 +171,6 @@ module.exports = function () { continue; } - const maxAttempts = 5; const results = new Map(); let tries = 0; let list; @@ -333,16 +339,16 @@ module.exports = function () { if (language) { ec['language'] = getValue(language); } switch (ec['istex_rtype']) { - case 'fulltext': - ec['rtype'] = data[genre] || 'MISC'; - break; - case 'metadata': - case 'enrichments': - case 'record': - ec['rtype'] = 'METADATA'; - break; - default: - ec['rtype'] = 'MISC'; + case 'fulltext': + ec['rtype'] = data[genre] || 'MISC'; + break; + case 'metadata': + case 'enrichments': + case 'record': + ec['rtype'] = 'METADATA'; + break; + default: + ec['rtype'] = 'MISC'; } } }; diff --git a/sudoc/README.md b/sudoc/README.md index 1d8df49..5ee6be1 100644 --- a/sudoc/README.md +++ b/sudoc/README.md @@ -21,6 +21,7 @@ Your EC needs a print_identifier for enrichment. 
+ **sudoc-enrich** : Set to ``false`` to disable sudoc enrichment. Enabled by default. + **sudoc-ttl** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)``. + **sudoc-throttle** : Minimum time to wait between queries, in milliseconds. Defaults to ``200``ms. ++ **sudoc-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``. ## How to use diff --git a/sudoc/index.js b/sudoc/index.js index d7a4a65..7f005c5 100644 --- a/sudoc/index.js +++ b/sudoc/index.js @@ -11,19 +11,25 @@ const isbnPattern = /^(97(8|9))?\d{9}(\d|X)$/i; */ module.exports = function () { const activated = (this.request.header('sudoc-enrich') || '').toLowerCase() === 'true'; - const throttle = parseInt(this.request.header('sudoc-throttle')) || 500; - const ttl = parseInt(this.request.header('sudoc-ttl')) || 3600 * 24 * 7; + let throttle = parseInt(this.request.header('sudoc-throttle')); + let ttl = parseInt(this.request.header('sudoc-ttl')); + let maxAttempts = parseInt(this.request.header('sudoc-max-attempts')); + + if (isNaN(packetSize)) { packetSize = 150; } + if (isNaN(throttle)) { throttle = 500; } + if (isNaN(ttl)) { ttl = 3600 * 24 * 7; } + if (isNaN(maxAttempts)) { maxAttempts = 5; } if (!activated) { this.logger.verbose('Sudoc enrichment not activated'); return function (ec, next) { next(); }; } - const self = this; - const report = this.report; + const self = this; + const report = this.report; const pending = new Map(); - const buffer = []; - let busy = false; + const buffer = []; + let busy = false; self.logger.verbose('Sudoc enrichment activated'); self.logger.verbose('Sudoc throttle: %dms', throttle); @@ -63,7 +69,6 @@ module.exports = function () { return pullBuffer(); } - const maxAttempts = 5; let tries = 0; (function querySudoc() { diff --git a/unpaywall/README.md b/unpaywall/README.md index 28b5cfe..7a3ec66 100644 --- a/unpaywall/README.md +++ b/unpaywall/README.md @@ -28,10 +28,11 @@ This API is limited to **100 
000** DOIs per day for everyone. It is necessary to + **unpaywall-TTL** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)`` + **unpaywall-throttle** : Minimum time to wait between each query, in milliseconds. Defaults to ``100``ms. Throttle time ``doubles`` after each failed attempt. + **unpaywall-paquet-size** : Maximum number of DOIs to request in parallel. Defaults to ``10`` -+ **unpaywall-buffer-size** : Maximum number of memorised access events before sending requests. Defaults to ``200`` ++ **unpaywall-buffer-size** : Maximum number of memorized access events before sending requests. Defaults to ``200`` + **unpaywall-max-tries** : Maximum number of attempts if an enrichment fails. Defaults to ``5``. + **unpaywall-on-fail** : Strategy to adopt if an enrichment reaches the maximum number of attempts. Can be either of ``abort``, ``ignore`` or ``retry``. Defaults to ``abort``. + **unpaywall-email** : The email to use for API calls. Defaults to ``YOUR_EMAIL``. ++ **unpaywall-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``. ## How to use diff --git a/zotero/README.md b/zotero/README.md index 9b068fc..ab30eb3 100644 --- a/zotero/README.md +++ b/zotero/README.md @@ -18,7 +18,8 @@ Enriches consultation events with [zotero] + **zotero-ttl** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)``. + **zotero-throttle** : Minimum time to wait between queries, in milliseconds. Defaults to ``200``ms. + **zotero-paquet-size** : Maximum number of identifiers to send for query in a single request. Defaults to ``50``. -+ **zotero-buffer-size** : Maximum number of memorised access events before sending a request. Defaults to ``1000``. ++ **zotero-buffer-size** : Maximum number of memorized access events before sending a request. Defaults to ``1000``. ++ **zotero-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``. 
## Prerequisites diff --git a/zotero/index.js b/zotero/index.js index 0375f2a..25a27f2 100644 --- a/zotero/index.js +++ b/zotero/index.js @@ -17,7 +17,7 @@ module.exports = function () { const logger = this.logger; const report = this.report; - const req = this.request; + const req = this.request; const cacheEnabled = !/^false$/i.test(req.header('zotero-cache')); @@ -31,11 +31,15 @@ module.exports = function () { let packetSize = parseInt(req.header('zotero-packet-size')); // Minimum number of ECs to keep before resolving them let bufferSize = parseInt(req.header('zotero-buffer-size')); + // Maximum number of trials before passing the EC in error + let maxAttempts = parseInt(req.header('zotero-max-attempts')); + if (isNaN(packetSize)) { packetSize = 10; } if (isNaN(bufferSize)) { bufferSize = 200; } if (isNaN(throttle)) { throttle = 100; } if (isNaN(ttl)) { ttl = 3600 * 24 * 7; } + if (isNaN(maxAttempts)) { maxAttempts = 5; } if (!cache) { const err = new Error('failed to connect to mongodb, cache not available for Zotero'); @@ -118,8 +122,7 @@ module.exports = function () { * @param {Object} ec the EC to process * @param {Function} done the callback */ - function* processEc (ec, done) { - const maxAttempts = 5; + function* processEc(ec, done) { let tries = 0; let result; @@ -198,11 +201,11 @@ module.exports = function () { }); } - /** - * Cache an item with a given ID - * @param {String} id the ID of the item - * @param {Object} item the item to cache - */ + /** + * Cache an item with a given ID + * @param {String} id the ID of the item + * @param {Object} item the item to cache + */ function cacheResult(id, item) { return new Promise((resolve, reject) => { if (!id || !item) { return resolve(); } From 5783d0c6aadbf09e73f96941e9ef98ac5a03f9a2 Mon Sep 17 00:00:00 2001 From: felixleo22 Date: Fri, 8 Nov 2024 14:51:34 +0100 Subject: [PATCH 2/3] fix: remove useless code --- sudoc/index.js | 1 - 1 file changed, 1 deletion(-) diff --git a/sudoc/index.js b/sudoc/index.js 
index 7f005c5..bf700d7 100644 --- a/sudoc/index.js +++ b/sudoc/index.js @@ -15,7 +15,6 @@ module.exports = function () { let ttl = parseInt(this.request.header('sudoc-ttl')); let maxAttempts = parseInt(this.request.header('sudoc-max-attempts')); - if (isNaN(packetSize)) { packetSize = 150; } if (isNaN(throttle)) { throttle = 500; } if (isNaN(ttl)) { ttl = 3600 * 24 * 7; } if (isNaN(maxAttempts)) { maxAttempts = 5; } From d9b1ca967c8cbf1dbef5e9ee4ea57ed2ca7372d5 Mon Sep 17 00:00:00 2001 From: Yannick Schurter Date: Tue, 12 Nov 2024 09:58:07 +0100 Subject: [PATCH 3/3] doc(unpaywall): remove nonexistent max-attempts header --- unpaywall/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unpaywall/README.md b/unpaywall/README.md index 7a3ec66..7baecd8 100644 --- a/unpaywall/README.md +++ b/unpaywall/README.md @@ -32,7 +32,6 @@ This API is limited to **100 000** DOIs per day for everyone. It is necessary to + **unpaywall-max-tries** : Maximum number of attempts if an enrichment fails. Defaults to ``5``. + **unpaywall-on-fail** : Strategy to adopt if an enrichment reaches the maximum number of attempts. Can be either of ``abort``, ``ignore`` or ``retry``. Defaults to ``abort``. + **unpaywall-email** : The email to use for API calls. Defaults to ``YOUR_EMAIL``. -+ **unpaywall-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``. ## How to use @@ -81,4 +80,4 @@ curl -X POST -v http://localhost:59599 \ -H "Log-Format-Ezproxy: " \ -F "file=@" -``` \ No newline at end of file +```