Skip to content

Commit

Permalink
Merge pull request #55 from ezpaarse-project/feat/max-attempts
Browse files Browse the repository at this point in the history
feat: make max-attempts configurable and make it available with header
  • Loading branch information
nojhamster authored Nov 12, 2024
2 parents 7532559 + d9b1ca9 commit 7ba07be
Show file tree
Hide file tree
Showing 20 changed files with 160 additions and 119 deletions.
2 changes: 1 addition & 1 deletion crossref/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ You can use ezunpaywall with crossref by placing it in front. This will save yo
+ **crossref-ttl** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)``.
+ **crossref-throttle** : Minimum time to wait between queries, in milliseconds. Defaults to ``200``ms.
+ **crossref-paquet-size** : Maximum number of identifiers to send for query in a single request. Defaults to ``50``.
+ **crossref-buffer-size** : Maximum number of memorised access events before sending a request. Defaults to ``1000``.
+ **crossref-buffer-size** : Maximum number of memorized access events before sending a request. Defaults to ``1000``.
+ **crossref-max-tries** : Maximum number of attempts if an enrichment fails. Defaults to ``5``.
+ **crossref-on-fail** : Strategy to adopt if an enrichment reaches the maximum number of attempts. Can be either of ``abort``, ``ignore`` or ``retry``. Defaults to ``abort``.
+ **crossref-base-wait-time** : Time to wait before retrying after a query fails, in milliseconds. Defaults to ``1000``ms. This time ``doubles`` after each attempt.
Expand Down
4 changes: 3 additions & 1 deletion datacite/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ Your EC needs a DOI for enrichment.
+ **datacite-ttl** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)``.
+ **datacite-throttle** : Minimum time to wait between queries, in milliseconds. Defaults to ``100``ms.
+ **datacite-paquet-size** : Maximum number of identifiers to send for query in a single request. Defaults to ``50``.
+ **datacite-buffer-size** : Maximum number of memorised access events before sending a request. Defaults to ``1000``.
+ **datacite-buffer-size** : Maximum number of memorized access events before sending a request. Defaults to ``1000``.
+ **datacite-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``.


## How to use

Expand Down
4 changes: 3 additions & 1 deletion datacite/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,14 @@ module.exports = function () {
let packetSize = parseInt(req.header('datacite-packet-size'));
// Minimum number of ECs to keep before resolving them
let bufferSize = parseInt(req.header('datacite-buffer-size'));
// Maximum number of trials before passing the EC in error
let maxAttempts = parseInt(req.header('datacite-max-attempts'));

if (isNaN(packetSize)) { packetSize = 10; }
if (isNaN(bufferSize)) { bufferSize = 1000; }
if (isNaN(throttle)) { throttle = 100; }
if (isNaN(ttl)) { ttl = 3600 * 24 * 7; }
if (isNaN(maxAttempts)) { maxAttempts = 5; }

if (!cache) {
const err = new Error('failed to connect to mongodb, cache not available for Datacite');
Expand Down Expand Up @@ -87,7 +90,6 @@ module.exports = function () {

const dois = ecs.map(([ec, done]) => ec.doi);

const maxAttempts = 5;
let tries = 0;
let docs;

Expand Down
2 changes: 2 additions & 0 deletions eprints/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ Your EC needs a domain belonging to an eprints platform and an eprints ID.
+ **eprints-packet-size** : Maximum number of article to query
+ **eprints-buffer-size** : Minimum number of ECs to keep before resolving them
+ **eprints-domain-name** : Domain name eprints platform
+ **eprints-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``.


## How to use

Expand Down
14 changes: 8 additions & 6 deletions eprints/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ module.exports = function eprints() {

const logger = this.logger;
const report = this.report;
const req = this.request;
const req = this.request;

const cacheEnabled = !/^false$/i.test(req.header('eprints-cache'));

Expand All @@ -35,13 +35,16 @@ module.exports = function eprints() {
let packetSize = parseInt(req.header('eprints-packet-size'));
// Minimum number of ECs to keep before resolving them
let bufferSize = parseInt(req.header('eprints-buffer-size'));
// Maximum number of trials before passing the EC in error
let maxAttempts = parseInt(req.header('eprints-max-attempts'));
// Domain name eprints platform
const domainName = req.header('eprints-domain-name');

if (isNaN(packetSize)) { packetSize = 10; }
if (isNaN(bufferSize)) { bufferSize = 200; }
if (isNaN(throttle)) { throttle = 100; }
if (isNaN(ttl)) { ttl = 3600 * 24 * 7; }
if (isNaN(maxAttempts)) { maxAttempts = 5; }

if (!cache) {
const err = new Error('failed to connect to mongodb, cache not available for eprints');
Expand Down Expand Up @@ -143,8 +146,7 @@ module.exports = function eprints() {
* @param {Object} ec the EC to process
* @param {Function} done the callback
*/
function* processEc (ec, done) {
const maxAttempts = 5;
function* processEc(ec, done) {
let tries = 0;
let result;

Expand Down Expand Up @@ -180,7 +182,7 @@ module.exports = function eprints() {
* Request metadata from OAI-PMH API for a given ID
* @param {String} id the id to query
*/
function query (id) {
function query(id) {
report.inc('general', 'eprints-queries');
return new Promise((resolve, reject) => {
const hostname = new URL(domainName).hostname;
Expand Down Expand Up @@ -238,9 +240,9 @@ module.exports = function eprints() {
* @param {Object} res the result to verify
*/
function verifFields(res) {
resultFields.forEach(function(field) {
resultFields.forEach(function (field) {
if (res.hasOwnProperty(field)) {
res =res[field][0];
res = res[field][0];
} else {
return false;
}
Expand Down
3 changes: 2 additions & 1 deletion ezunpaywall/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ You need an API key to use this service. You can use the **demo** apikey but it'
+ **ezunpaywall-ttl** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)``.
+ **ezunpaywall-throttle** : Minimum time to wait between queries, in milliseconds. Defaults to ``100``ms.
+ **ezunpaywall-paquet-size** : Maximum number of identifiers to send for query in a single request. Defaults to ``100``.
+ **ezunpaywall-buffer-size** : Maximum number of memorised access events before sending a request. Defaults to ``1000``.
+ **ezunpaywall-buffer-size** : Maximum number of memorized access events before sending a request. Defaults to ``1000``.
+ **ezunpaywall-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``.
+ **ezunpaywall-api-key** : apikey to use ezunpaywall.

## How to use
Expand Down
6 changes: 4 additions & 2 deletions ezunpaywall/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ module.exports = function () {

const logger = this.logger;
const report = this.report;
const req = this.request;
const req = this.request;

const cacheEnabled = !/^false$/i.test(req.header('ezunpaywall-cache'));

Expand All @@ -38,11 +38,14 @@ module.exports = function () {
let packetSize = parseInt(req.header('ezunpaywall-packet-size'));
// Minimum number of ECs to keep before resolving them
let bufferSize = parseInt(req.header('ezunpaywall-buffer-size'));
// Maximum number of trials before passing the EC in error
let maxAttempts = parseInt(req.header('ezunpaywall-max-attempts'));

if (isNaN(packetSize)) { packetSize = 100; }
if (isNaN(bufferSize)) { bufferSize = 1000; }
if (isNaN(throttle)) { throttle = 100; }
if (isNaN(ttl)) { ttl = 3600 * 24 * 7; }
if (isNaN(maxAttempts)) { maxAttempts = 5; }

if (!cache) {
const err = new Error('failed to connect to mongodb, cache not available for ezunpaywall');
Expand Down Expand Up @@ -107,7 +110,6 @@ module.exports = function () {

const dois = ecs.map(([ec, done]) => ec.doi);

const maxAttempts = 5;
let tries = 0;
let docs;

Expand Down
5 changes: 3 additions & 2 deletions hal/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ The HAL middleware uses the ``hal-identifier`` found in the access events to req
hal-cache
+ **hal-ttl** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)``.
+ **hal-throttle** : Minimum time to wait between queries, in milliseconds. Defaults to ``500``.
+ **hal-paquet-size** :
+ **hal-buffer-siz** :
+ **hal-paquet-size** : Maximum number of identifiers to send for query in a single request. Defaults to ``100``.
+ **hal-buffer-size** : Maximum number of memorized access events before sending a request. Defaults to ``1000``.
+ **hal-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``.

## How to use

Expand Down
52 changes: 29 additions & 23 deletions hal/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,32 @@ try {
* Enrich ECs with hal data
*/
module.exports = function () {
const self = this;
const report = this.report;
const req = this.request;
const activated = (methal !== null) && /^true$/i.test(req.header('hal-enrich'));
const self = this;
const report = this.report;
const req = this.request;
const activated = (methal !== null) && /^true$/i.test(req.header('hal-enrich'));
const cacheEnabled = !/^false$/i.test(req.header('hal-cache'));

if (!activated) { return function (ec, next) { next(); }; }

self.logger.verbose('hal cache: %s', cacheEnabled ? 'enabled' : 'disabled');

const ttl = parseInt(req.header('hal-ttl')) || 3600 * 24 * 7;
const throttle = parseInt(req.header('hal-throttle')) || 100;
const packetSize = parseInt(req.header('hal-paquet-size')) || 150;
const maxAttempts = 5;
// Time-to-live of cached documents
let ttl = parseInt(req.header('hal-ttl'));
// Minimum wait time before each request (in ms)
let throttle = parseInt(req.header('hal-throttle'));
// Maximum number of ID to query
let packetSize = parseInt(req.header('hal-paquet-size'));
// Minimum number of ECs to keep before resolving them
let bufferSize = parseInt(req.header('hal-buffer-size'));
let bufferSize = parseInt(req.header('hal-buffer-size'));
// Maximum number of trials before passing the EC in error
let maxAttempts = parseInt(req.header('hal-max-attempts'));

if (isNaN(bufferSize)) {
bufferSize = 1000;
}
if (isNaN(packetSize)) { packetSize = 150; }
if (isNaN(bufferSize)) { bufferSize = 1000; }
if (isNaN(throttle)) { throttle = 100; }
if (isNaN(ttl)) { ttl = 3600 * 24 * 7; }
if (isNaN(maxAttempts)) { maxAttempts = 5; }

const buffer = [];
let busy = false;
Expand Down Expand Up @@ -309,24 +315,24 @@ module.exports = function () {
let sidDepot = null;

if (doc) {
ec['hal_docid'] = doc.docid;
ec['hal_identifiant'] = doc.halId_s;
ec['hal_docid'] = doc.docid;
ec['hal_identifiant'] = doc.halId_s;
ec['publication_title'] = (doc.title_s || [''])[0];
ec['hal_tampons'] = (doc.collId_i || []).join(',');
ec['hal_tampons_name'] = (doc.collCode_s || []).join(',');
ec['hal_domains'] = (doc.domain_s || []).join(',');
ec['hal_tampons'] = (doc.collId_i || []).join(',');
ec['hal_tampons_name'] = (doc.collCode_s || []).join(',');
ec['hal_domains'] = (doc.domain_s || []).join(',');

sidDepot = doc.sid_i;

// Formatage du document à mettre en cache
cacheDoc = [];
cacheDoc['hal_docid'] = ec.hal_docid;
cacheDoc['hal_identifiant'] = ec.hal_identifiant;
cacheDoc['hal_docid'] = ec.hal_docid;
cacheDoc['hal_identifiant'] = ec.hal_identifiant;
cacheDoc['publication_title'] = ec.publication_title;
cacheDoc['hal_tampons'] = ec.hal_tampons;
cacheDoc['hal_tampons_name'] = ec.hal_tampons_name;
cacheDoc['hal_domains'] = ec.hal_domains;
cacheDoc['hal_sid'] = sidDepot;
cacheDoc['hal_tampons'] = ec.hal_tampons;
cacheDoc['hal_tampons_name'] = ec.hal_tampons_name;
cacheDoc['hal_domains'] = ec.hal_domains;
cacheDoc['hal_sid'] = sidDepot;
}

let idTocache = identifiantOriginel || ec.hal_identifiant;
Expand Down
3 changes: 2 additions & 1 deletion istex/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ The ISTEX middleware uses the ``istex-identifier`` found in the access events to
+ **istex-ttl** : Lifetime of cached documents, in seconds. Defaults to ``7 days (3600 * 24 * 7)``.
+ **istex-throttle** : Minimum time to wait between queries, in milliseconds. Defaults to ``500``.
+ **istex-paquet-size** : Maximum number of identifiers to send for query in a single request. Defaults to ``50``.
+ **istex-buffer-size** : Maximum number of memorised access events before sending a request. Defaults to ``1000``.
+ **istex-buffer-size** : Maximum number of memorized access events before sending a request. Defaults to ``1000``.
+ **istex-max-attempts** : Maximum number of trials before passing the EC in error. Defaults to ``5``.

## How to use

Expand Down
80 changes: 42 additions & 38 deletions istex/index.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
'use strict';

const istex = require('node-istex').defaults({ extraQueryString: { sid: 'ezpaarse' }});
const co = require('co');
const data = require('./istex-rtype.json'); // matching between ezPAARSE and Istex types
const istex = require('node-istex').defaults({ extraQueryString: { sid: 'ezpaarse' } });
const co = require('co');
const data = require('./istex-rtype.json'); // matching between ezPAARSE and Istex types
const cache = ezpaarse.lib('cache')('istex');

const tiffCorpus = new Set(['EEBO', 'ECCO']);
Expand All @@ -24,21 +24,26 @@ const fields = [
* Enrich ECs with istex data
*/
module.exports = function () {
const self = this;
const report = this.report;
const req = this.request;
const activated = /^true$/i.test(req.header('istex-enrich'));
const self = this;
const report = this.report;
const req = this.request;
const activated = /^true$/i.test(req.header('istex-enrich'));
const cacheEnabled = !/^false$/i.test(req.header('istex-cache'));

if (!activated) { return function (ec, next) { next(); }; }

self.logger.verbose('Istex cache: %s', cacheEnabled ? 'enabled' : 'disabled');

const ttl = parseInt(req.header('istex-ttl')) || 3600 * 24 * 7;
const throttle = parseInt(req.header('istex-throttle')) || 100;
// Time-to-live of cached documents
const ttl = parseInt(req.header('istex-ttl')) || 3600 * 24 * 7;
// Minimum wait time before each request (in ms)
const throttle = parseInt(req.header('istex-throttle')) || 100;
// Maximum number of ID to query
const packetSize = parseInt(req.header('istex-paquet-size')) || 150;
// Minimum number of ECs to keep before resolving them
let bufferSize = parseInt(req.header('istex-buffer-size'));
let bufferSize = parseInt(req.header('istex-buffer-size'));
// Maximum number of trials before passing the EC in error
let maxAttempts = parseInt(req.header('istex-max-attempts'));

if (isNaN(bufferSize)) {
bufferSize = 1000;
Expand Down Expand Up @@ -166,7 +171,6 @@ module.exports = function () {
continue;
}

const maxAttempts = 5;
const results = new Map();
let tries = 0;
let list;
Expand Down Expand Up @@ -234,8 +238,8 @@ module.exports = function () {
report.inc('general', 'istex-queries');

const subQueries = [];
const istexIds = [];
const arks = [];
const istexIds = [];
const arks = [];

ids.forEach(id => {
/^ark:/i.test(id) ? arks.push(id) : istexIds.push(id);
Expand Down Expand Up @@ -280,12 +284,12 @@ module.exports = function () {
// We only cache what we need to limit memory usage
const cached = {
publicationDate: item.publicationDate,
copyrightDate: item.copyrightDate,
corpusName: item.corpusName,
language: item.language,
genre: item.genre,
host: item.host,
doi: item.doi
copyrightDate: item.copyrightDate,
corpusName: item.corpusName,
language: item.language,
genre: item.genre,
host: item.host,
doi: item.doi
};

cache.set(id, cached, (err, result) => {
Expand Down Expand Up @@ -319,32 +323,32 @@ module.exports = function () {
}

if (host) {
if (host.isbn) { ec['print_identifier'] = getValue(host.isbn); }
if (host.issn) { ec['print_identifier'] = getValue(host.issn); }
if (host.eisbn) { ec['online_identifier'] = getValue(host.eisbn); }
if (host.eissn) { ec['online_identifier'] = getValue(host.eissn); }
if (host.title) { ec['publication_title'] = getValue(host.title); }
if (host.isbn) { ec['print_identifier'] = getValue(host.isbn); }
if (host.issn) { ec['print_identifier'] = getValue(host.issn); }
if (host.eisbn) { ec['online_identifier'] = getValue(host.eisbn); }
if (host.eissn) { ec['online_identifier'] = getValue(host.eissn); }
if (host.title) { ec['publication_title'] = getValue(host.title); }
if (host.subject && host.subject.value) { ec['subject'] = getValue(host.subject).value; }
}

ec['publication_date'] = publicationDate || copyrightDate;

if (doi) { ec['doi'] = getValue(doi); }
if (arkIstex) { ec['ark'] = getValue(arkIstex); }
if (genre) { ec['istex_genre'] = getValue(genre); }
if (language) { ec['language'] = getValue(language); }
if (doi) { ec['doi'] = getValue(doi); }
if (arkIstex) { ec['ark'] = getValue(arkIstex); }
if (genre) { ec['istex_genre'] = getValue(genre); }
if (language) { ec['language'] = getValue(language); }

switch (ec['istex_rtype']) {
case 'fulltext':
ec['rtype'] = data[genre] || 'MISC';
break;
case 'metadata':
case 'enrichments':
case 'record':
ec['rtype'] = 'METADATA';
break;
default:
ec['rtype'] = 'MISC';
case 'fulltext':
ec['rtype'] = data[genre] || 'MISC';
break;
case 'metadata':
case 'enrichments':
case 'record':
ec['rtype'] = 'METADATA';
break;
default:
ec['rtype'] = 'MISC';
}
}
};
Expand Down
Loading

0 comments on commit 7ba07be

Please sign in to comment.