-
Notifications
You must be signed in to change notification settings - Fork 0
/
gather-webpages.js
80 lines (66 loc) · 2.19 KB
/
gather-webpages.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
// PARALLEL MODE
// Every webpage row is handed to `async.each` further down in this file.
var async = require('async');
var fs = require('fs');
var path = require('path');
// NOTE(review): in the visible part of this file lodash is only referenced by
// commented-out code.
var _ = require('lodash');
var parse = require('csv-parse/lib/sync');
// Raw text of the webpage listing, read synchronously at startup. It is parsed
// below with a tab delimiter.
var webpagesCsv = fs.readFileSync('./webpages-data-view-deduped.txt', 'utf8');
var request = require('requestretry');
// JSON file holding the `normalizedUrlToFileId` object, including its
// `nextAvailableFileId` entry.
var NORMALIZED_URL_TO_FILE_ID_FILE = './normalized-url-to-file-id.json';
// Directory in which each fetched page is stored as `<fileId>.output`.
var WEBPAGES_PREFIX = './webpages';
/**
 * `requestretry` client with the defaults shared by every page fetch:
 * `maxAttempts` of 3 and a `retryDelay` of 5000 between attempts.
 *
 * NOTE(review): `strictSSL: false` is forwarded to the underlying request
 * library and relaxes TLS certificate checking; confirm this is still wanted.
 */
var webpageRequest = request.defaults({
  strictSSL: false,
  maxAttempts: 3,
  retryDelay: 5000,
  // Any error at all triggers a retry (the error object itself is returned as
  // the truthy answer); otherwise defer to the library's stock strategy.
  retryStrategy: function (err, res, body) {
    if (err) {
      return err;
    }
    return request.RetryStrategies.HTTPOrNetworkError(err, res, body);
  }
});
// Parse the tab-delimited listing. With `columns: true` the rows come back as
// objects (the rest of the file reads `webpage.url` from them).
var webpages = parse(webpagesCsv, { delimiter: '\t', columns: true });
// TODO: remove
// Tag each row with its position in the listing; that index names the
// row's output file.
for (let rowIndex = 0; rowIndex < webpages.length; rowIndex += 1) {
  webpages[rowIndex].fileId = rowIndex;
}
// Load the persisted map when the JSON file exists; otherwise start from a
// fresh object that only carries the counter entry.
var normalizedUrlToFileId = fs.existsSync(NORMALIZED_URL_TO_FILE_ID_FILE)
  ? JSON.parse(fs.readFileSync(NORMALIZED_URL_TO_FILE_ID_FILE, 'utf8'))
  : {
      // Special key value
      // TODO: use `nextAvailableFileId`
      nextAvailableFileId: 0
    };
/**
 * Fetch one webpage and store the result as `<fileId>.output` under
 * WEBPAGES_PREFIX.
 *
 * A page counts as already visited when its output file exists; in that case
 * no request is made and `cb` is invoked on the next tick. The normalized-URL
 * bookkeeping is currently disabled, so the output file is the only marker.
 *
 * When the request fails, the error is logged and its string form is written
 * to the output file instead of the page body. As a consequence a failed page
 * is also treated as visited on later runs.
 *
 * @param {{url: string, fileId: (number|string)}} webpage Row to fetch; the
 *   host `tto.tuoitre.vn` in `url` is rewritten to `thethao.tuoitre.vn`.
 * @param {function(): void} cb Completion callback; always called without
 *   arguments, so a failed fetch never aborts the surrounding iteration.
 */
function processWebpage(webpage, cb) {
  var outputPath = path.join(WEBPAGES_PREFIX, webpage.fileId + '.output');

  if (fs.existsSync(outputPath)) {
    console.log(`Already visited ${webpage.url}`);
    return process.nextTick(cb);
  }

  webpageRequest({
    url: webpage.url.replace('tto.tuoitre.vn', 'thethao.tuoitre.vn')
  }, function (err, res) {
    if (err) {
      console.log(err);
    }
    // fs.writeFileSync rejects arbitrary objects on current Node versions, so
    // an Error must be turned into text explicitly; passing it through
    // unchanged would throw inside this callback and abort the whole run.
    var contents = err ? String(err) : res.body;
    fs.writeFileSync(outputPath, contents, 'utf8');
    console.log(`Visited ${webpage.url}`);
    cb();
  });
}
// Run `processWebpage` over every row. The final callback receives whatever
// error a worker passed to its callback, if any.
async.each(webpages, processWebpage, (failure) => {
  if (!failure) {
    // Record the row count as the next free id and persist the map.
    normalizedUrlToFileId.nextAvailableFileId = webpages.length;
    const serializedMap = JSON.stringify(normalizedUrlToFileId);
    fs.writeFileSync(NORMALIZED_URL_TO_FILE_ID_FILE, serializedMap, 'utf8');
    console.log('Success');
    return;
  }
  console.log('Error during op');
  console.log(failure);
});