-
Notifications
You must be signed in to change notification settings - Fork 0
/
process-scraped-data.js
51 lines (40 loc) · 1.65 KB
/
process-scraped-data.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
// Collapses the raw serviced search responses into one record per
// (newspaper, webpage title) and writes the result as JSON.
//
// Input layout, as traversed below:
//   servicedQueries[newspaperId][query][year] -> array of response bodies,
//   each of which may carry an `items` array of search hits.
// Output layout:
//   nIdToWebpages[newspaperId][title] ->
//     { year, title, url, snippet, <query>: true, ... }
var fs = require('fs');

var SERVICED_QUERIES_FILE = './serviced-queries/serviced-queries-final-040418.json';
var NEWSPAPERS_FILE = './newspapers.json';
var NEWSPAPERS_SCRAPED_DATA_FILE = './newspapers-scraped-data.json';

var hasOwn = Object.prototype.hasOwnProperty;

var servicedQueries = JSON.parse(fs.readFileSync(SERVICED_QUERIES_FILE, 'utf8'));
var newspapers = JSON.parse(fs.readFileSync(NEWSPAPERS_FILE, 'utf8'));

// Maps from newspaper id to (webpage title -> webpage metadata).
var nIdToWebpages = {};

// Every newspaper listed in newspapers.json gets an entry, even when no
// webpage was found for it. The per-newspaper dictionaries are created
// without a prototype because their keys are arbitrary page titles: a title
// such as "__proto__" must become an ordinary key, not alter the object.
newspapers.forEach(function (newspaper) {
  nIdToWebpages[newspaper.uid] = Object.create(null);
});

Object.keys(servicedQueries).forEach(function (newspaperId) {
  // An id that appears in the serviced queries but not in newspapers.json
  // used to abort the run with a TypeError; give it a bucket instead.
  if (!hasOwn.call(nIdToWebpages, newspaperId)) {
    nIdToWebpages[newspaperId] = Object.create(null);
  }
  var webpagesByTitle = nIdToWebpages[newspaperId];
  var queriesForNewspaper = servicedQueries[newspaperId];

  Object.keys(queriesForNewspaper).forEach(function (q) {
    var responsesByYear = queriesForNewspaper[q];

    Object.keys(responsesByYear).forEach(function (year) {
      // Always parse with an explicit radix; the year comes from a JSON key.
      var rawYear = parseInt(year, 10);

      responsesByYear[year].forEach(function (resBody) {
        // Responses without any hits have no `items` field.
        if (!('items' in resBody)) {
          return;
        }

        resBody.items.forEach(function (rawWebpageMeta) {
          var title = rawWebpageMeta.title;
          var webpageMeta;

          // Own-property check: the `in` operator also matches inherited
          // names, so a page titled "constructor" or "toString" would be
          // mistaken for an already-recorded webpage.
          if (hasOwn.call(webpagesByTitle, title)) {
            webpageMeta = webpagesByTitle[title];
            // Keep the earliest year in which the page was returned.
            if (rawYear < webpageMeta.year) {
              webpageMeta.year = rawYear;
            }
          } else {
            webpageMeta = {
              year: rawYear,
              title: title,
              url: rawWebpageMeta.link,
              snippet: rawWebpageMeta.snippet
            };
            webpagesByTitle[title] = webpageMeta;
          }

          // Flag the query that surfaced this page.
          // NOTE(review): the query string shares a namespace with the
          // year/title/url/snippet fields; a query literally named "year"
          // would overwrite that field. Confirm the query set cannot collide.
          webpageMeta[q] = true;
        });
      });
    });
  });
});

fs.writeFileSync(NEWSPAPERS_SCRAPED_DATA_FILE, JSON.stringify(nIdToWebpages), 'utf8');