Skip to content

Commit

Permalink
Support multiple parsers (#16)
Browse files Browse the repository at this point in the history
* wip

* add ACL anthology URL to allow CORS from chrome extension

* add ACL anthology parser
  • Loading branch information
denkiwakame authored May 28, 2024
1 parent 26b01d7 commit aed8763
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 89 deletions.
3 changes: 2 additions & 1 deletion manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
"host_permissions": [
"*://api.notion.com/*",
"*://www.notion.so/*",
"*://openreview.net/*"
"*://openreview.net/*",
"*://aclanthology.org/*"
],
"content_security_policy": {
"extension_pages": "script-src 'self'; object-src 'self'"
Expand Down
147 changes: 147 additions & 0 deletions src/js/parsers.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
// MIT License
// Copyright (c) 2024 denkiwakame <[email protected]>

/**
 * Registry that maps URL prefixes to parser functions.
 *
 * Registrations are tried in insertion order and the first matching prefix
 * wins, so register more specific prefixes before more general ones.
 */
class URLParser {
  constructor() {
    // Ordered list of { domain, handler } registrations.
    this.parsers = [];
  }

  /**
   * Registers a handler for every URL that begins with `domain`.
   *
   * @param {string} domain - URL prefix, e.g. 'https://arxiv.org'.
   * @param {Function} handler - Called with the full URL when the prefix matches.
   */
  addParser(domain, handler) {
    this.parsers.push({ domain, handler });
  }

  /**
   * Dispatches `url` to the first registered handler whose prefix matches.
   *
   * @param {string} url - URL to parse.
   * @returns {Promise<*>} Resolves with whatever the matching handler returns.
   * @throws {Error} When `url` is not a string or no registered prefix matches.
   */
  async parse(url) {
    if (typeof url === 'string') {
      for (const { domain, handler } of this.parsers) {
        if (!url.startsWith(domain)) continue;
        // A prefix without a trailing slash (e.g. 'https://arxiv.org') must end
        // at a URL component boundary; otherwise a look-alike host such as
        // 'https://arxiv.org.evil.example' would be routed to the handler.
        const next = url.charAt(domain.length);
        const endsAtBoundary =
          domain.endsWith('/') ||
          next === '' ||
          next === '/' ||
          next === '?' ||
          next === '#';
        if (endsAtBoundary) return handler(url);
      }
    }
    throw new Error('No parser found for the given URL');
  }
}

/**
 * Fetches metadata for an arXiv paper through the arXiv export API.
 *
 * @param {string} url - Paper URL ending in an arXiv identifier.
 * @returns {Promise<object|undefined>} Object with id, title, abst, authors,
 *   url, published, comment and publisher; `undefined` when the API responds
 *   with a non-200 status (the failure is logged via console.error).
 * @throws {Error} When `url` carries no arXiv identifier or the API response
 *   contains no <entry> element.
 */
const arXivParser = async (url) => {
  const ARXIV_API = 'http://export.arxiv.org/api/query/search_query';
  // ref: https://info.arxiv.org/help/arxiv_identifier.html
  // e.g. (new id format: 2404.16782) | (old id format: hep-th/0702063)
  // Null-safe so that a missing <id> element yields `undefined`, not a TypeError.
  const parseArXivId = (str) =>
    str?.match(/(\d+\.\d+$)|((\w|-)+\/\d+$)/)?.[0];

  const paperId = parseArXivId(url);
  if (paperId === undefined) {
    throw new Error(`Could not find an arXiv identifier in URL: ${url}`);
  }

  const res = await fetch(`${ARXIV_API}?id_list=${paperId}`);
  if (res.status !== 200) {
    console.error('arXiv API request failed');
    return;
  }
  const data = await res.text();
  const xmlData = new window.DOMParser().parseFromString(data, 'text/xml');

  const entry = xmlData.querySelector('entry');
  if (!entry) {
    throw new Error(`arXiv API returned no entry for id: ${paperId}`);
  }

  // NOTE(review): the identifier pattern is anchored at the end of the string,
  // so an <id> value carrying a version suffix would not match and `id` would
  // be undefined — confirm against real API responses.
  const id = parseArXivId(entry.querySelector('id')?.textContent);
  const title = entry.querySelector('title').textContent;
  const abst = entry.querySelector('summary').textContent;
  const authors = Array.from(entry.querySelectorAll('author'), (author) =>
    author.textContent.trim()
  );
  const published = entry.querySelector('published').textContent;
  // <comment> is optional in the response; fall back to the 'none' placeholder.
  const comment = entry.querySelector('comment')?.textContent ?? 'none';

  return {
    id,
    title,
    abst,
    authors,
    url,
    published,
    comment,
    publisher: 'arXiv',
  };
};

/**
 * Fetches an OpenReview page and extracts paper metadata from its
 * `citation_*` <meta> tags.
 *
 * @param {string} url - OpenReview URL; the `id` query parameter becomes the paper id.
 * @returns {Promise<object>} Object with id, title, abst, authors, url,
 *   published (YYYY-MM-DD), comment and publisher.
 * @throws {Error} When the HTTP request fails or a required <meta> tag is missing.
 */
const openReviewParser = async (url) => {
  // Returns the content of <meta name="..."> or fails with a descriptive error.
  const requireMeta = (doc, name) => {
    const content = doc
      .querySelector(`meta[name="${name}"]`)
      ?.getAttribute('content');
    if (content == null) {
      throw new Error(`OpenReview page is missing <meta name="${name}">`);
    }
    return content;
  };

  // Converts a date string to YYYY-MM-DD. Plain calendar dates (YYYY, YYYY/MM,
  // YYYY/MM/DD, also with '-') are formatted directly: `new Date()` parses
  // slash-separated dates as local time, so toISOString() would report the
  // previous day in timezones ahead of UTC.
  const toIsoDate = (raw) => {
    const parts = /^\s*(\d{4})(?:[/-](\d{1,2}))?(?:[/-](\d{1,2}))?\s*$/.exec(raw);
    if (parts) {
      const [, year, month = '1', day = '1'] = parts;
      const utc = new Date(Date.UTC(Number(year), Number(month) - 1, Number(day)));
      const isRealDate =
        utc.getUTCMonth() === Number(month) - 1 &&
        utc.getUTCDate() === Number(day);
      if (isRealDate) return utc.toISOString().split('T')[0];
    }
    // Anything else (e.g. a full ISO timestamp) keeps the Date-based conversion.
    return new Date(raw).toISOString().split('T')[0];
  };

  const id = new URL(url).searchParams.get('id');
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`OpenReview request failed with HTTP status ${res.status}`);
  }
  const html = await res.text();
  const doc = new DOMParser().parseFromString(html, 'text/html');

  const listedAuthors = Array.from(
    doc.querySelectorAll('meta[name="citation_author"]'),
    (author) => author.getAttribute('content')
  );
  // Pages that list no authors are reported as a single 'Anonymous' author.
  const authors = listedAuthors.length ? listedAuthors : ['Anonymous'];

  const title = requireMeta(doc, 'citation_title');
  const abst = requireMeta(doc, 'citation_abstract');
  const published = toIsoDate(requireMeta(doc, 'citation_online_date'));

  return {
    id,
    title,
    abst,
    authors,
    url,
    published,
    comment: 'none',
    publisher: 'OpenReview',
  };
};

/**
 * Fetches an ACL Anthology paper page and extracts metadata from its
 * `citation_*` <meta> tags and the paper-details list.
 *
 * @param {string} url - ACL Anthology paper URL.
 * @returns {Promise<object>} Object with id (the citation_doi value), title,
 *   abst (always 'none'), authors, url, published (YYYY-MM-DD), comment (the
 *   citation_pdf_url value) and publisher.
 * @throws {Error} When the HTTP request fails, a required <meta> tag is
 *   missing, or the publisher entry cannot be found.
 */
const aclAnthologyParser = async (url) => {
  // Returns the content of <meta name="..."> or fails with a descriptive error.
  const requireMeta = (doc, name) => {
    const content = doc
      .querySelector(`meta[name="${name}"]`)
      ?.getAttribute('content');
    if (content == null) {
      throw new Error(`ACL Anthology page is missing <meta name="${name}">`);
    }
    return content;
  };

  // Converts a date string to YYYY-MM-DD. Plain calendar dates (YYYY, YYYY/MM,
  // YYYY/MM/DD, also with '-') are formatted directly: `new Date()` parses
  // slash-separated dates as local time, so toISOString() would report the
  // previous day in timezones ahead of UTC.
  const toIsoDate = (raw) => {
    const parts = /^\s*(\d{4})(?:[/-](\d{1,2}))?(?:[/-](\d{1,2}))?\s*$/.exec(raw);
    if (parts) {
      const [, year, month = '1', day = '1'] = parts;
      const utc = new Date(Date.UTC(Number(year), Number(month) - 1, Number(day)));
      const isRealDate =
        utc.getUTCMonth() === Number(month) - 1 &&
        utc.getUTCDate() === Number(day);
      if (isRealDate) return utc.toISOString().split('T')[0];
    }
    // Anything else (e.g. a full ISO timestamp) keeps the Date-based conversion.
    return new Date(raw).toISOString().split('T')[0];
  };

  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`ACL Anthology request failed with HTTP status ${res.status}`);
  }
  const html = await res.text();
  const doc = new DOMParser().parseFromString(html, 'text/html');

  const id = requireMeta(doc, 'citation_doi');
  const authors = Array.from(
    doc.querySelectorAll('meta[name="citation_author"]'),
    (author) => author.getAttribute('content')
  );
  const title = requireMeta(doc, 'citation_title');
  const published = toIsoDate(requireMeta(doc, 'citation_publication_date'));

  // The publisher is read positionally from the 7th <dd> of the details list.
  // NOTE(review): this depends on the page layout — confirm the index still
  // points at the publisher row.
  const publisherCell = doc.querySelectorAll('.acl-paper-details dd')[6];
  if (!publisherCell) {
    throw new Error('ACL Anthology page has no publisher entry in the paper details');
  }
  const publisher = publisherCell.textContent.replaceAll('\n', '');

  // The PDF link is carried in the comment field.
  const comment = requireMeta(doc, 'citation_pdf_url');

  return {
    id,
    title,
    abst: 'none',
    authors,
    url,
    published,
    comment,
    publisher,
  };
};

// Module-wide parser registry. Each row pairs a URL prefix with the parser
// that handles it; rows are registered in the order listed.
const urlParser = new URLParser();
const registrations = [
  ['https://openreview.net/', openReviewParser],
  ['https://arxiv.org', arXivParser],
  ['https://aclanthology.org', aclAnthologyParser],
];
for (const [prefix, parser] of registrations) {
  urlParser.addParser(prefix, parser);
}

export default urlParser;
95 changes: 7 additions & 88 deletions src/js/popup.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@ import Icons from 'uikit/dist/js/uikit-icons';
import Mustache from 'mustache';
import NotionClient from './notion.js';
import thenChrome from 'then-chrome';
import urlParser from './parsers.js';

UIKit.use(Icons);

const TEST_URL = 'https://arxiv.org/abs/2308.04079';
const ARXIV_API = 'http://export.arxiv.org/api/query/search_query';
// const TEST_URL = 'https://aclanthology.org/2023.ijcnlp-main.1/';

class UI {
constructor() {
this.setupProgressBar();
Expand Down Expand Up @@ -97,13 +99,11 @@ class UI {
return url && url.split('.').pop() === 'pdf';
}
async getPaperInfo(url) {
if (this.isArxivUrl(url)) return this.getArXivInfo(url);
if (this.isOpenReviewUrl(url)) return this.getOpenReviewInfo(url);
this.showProgressBar();
const data = await urlParser.parse(url);
this.setFormContents(data.title, data.abst, data.comment, data.authors);
return data;
}
// ref: https://info.arxiv.org/help/arxiv_identifier.html
// e.g. (new id format: 2404.16782) | (old id format: hep-th/0702063)
parseArXivId = (str) => str.match(/(\d+\.\d+$)|((\w|-)+\/\d+$)/)?.[0];

setFormContents(paperTitle, abst, comment, authors) {
document.getElementById('js-title').value = paperTitle;
document.getElementById('js-abst').value = abst;
Expand All @@ -118,87 +118,6 @@ class UI {
});
}

async getArXivInfo(url) {
this.showProgressBar();
const paperId = this.parseArXivId(url);

const res = await fetch(ARXIV_API + '?id_list=' + paperId.toString());
if (res.status != 200) {
console.error('arXiv API request failed');
return;
}
const data = await res.text(); // TODO: error handling
console.log(res.status);
const xmlData = new window.DOMParser().parseFromString(data, 'text/xml');
console.log(xmlData);

const entry = xmlData.querySelector('entry');
const id = this.parseArXivId(entry.querySelector('id')?.textContent);
const paperTitle = entry.querySelector('title').textContent;
const abst = entry.querySelector('summary').textContent;
const authors = Array.from(entry.querySelectorAll('author')).map(
(author) => {
return author.textContent.trim();
}
);
const published = entry.querySelector('published').textContent;
const comment = entry.querySelector('comment')?.textContent ?? 'none';
this.setFormContents(paperTitle, abst, comment, authors);
return {
id: id,
title: paperTitle,
abst: abst,
authors: authors,
url: url,
published: published,
comment: comment,
publisher: 'arXiv',
};
}

async getOpenReviewInfo(url) {
this.showProgressBar();
const id = new URLSearchParams(new URL(url).search).get('id');

const res = await fetch(url);
const html = await res.text();
const parser = new DOMParser();
const xml = parser.parseFromString(html, 'text/html');

const authorsArray = Array.from(
xml.querySelectorAll('meta[name="citation_author"]'),
(author) => author.getAttribute('content')
);
const authors = authorsArray.length ? authorsArray : ['Anonymous'];

const paperTitle = xml
.querySelector('meta[name="citation_title"]')
.getAttribute('content');

const abst = xml
.querySelector('meta[name="citation_abstract"]')
.getAttribute('content');

const date = xml
.querySelector('meta[name="citation_publication_date"]')
.getAttribute('content');
// -> ISO 8601 date string
const published = new Date(date).toISOString().split('T')[0];
const comment = 'none';

this.setFormContents(paperTitle, abst, comment, authors);
return {
id: id,
title: paperTitle,
abst: abst,
authors: authors,
url: url,
published: published,
comment: comment,
publisher: 'OpenReview',
};
}

renderMessage(type, message, overwrite = false) {
// type: warning, danger, success, primary
const template = `<div class="uk-alert-{{type}}" uk-alert><a class="uk-alert-close" uk-close></a><p>{{message}}</p></div>`;
Expand Down

0 comments on commit aed8763

Please sign in to comment.