-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
executable file
·251 lines (225 loc) · 8.04 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/usr/bin/env node
// Config
// Root URL of the Fugazi Live Series section; the page slug is appended to it
// (separated by "/") to build the first URL to scrape.
const baseUrl = 'https://www.dischord.com/fugazi_live_series';
// String placed between the fields of each CSV row.
const csvFieldSeparator = ',';
// String wrapped around every CSV field; occurrences inside a field are
// escaped by escapeCsvTextDelimiter().
const csvTextDelimiter = '"';
// Includes
const chalk = require('chalk');
const fs = require('fs');
const program = require('commander');
const puppeteer = require('puppeteer');
/*
 * Command-line definition and the scrape itself.
 *
 * Declares the --output, --page and --count options, then (in the action
 * handler) validates them, opens the output file, launches a browser, loads
 * the starting page and writes one CSV row per scraped page, following the
 * page's "next" link until the requested count is reached or no link is found.
 */
program
  .on('--help', function() {
    console.log('');
    console.log(chalk.bold('Fugotcha is a command-line utility for scraping data from the Fugazi Live Series on Dischord.com.'));
  })
  .option('-o --output <file>',
    'Required. The name of the file to which to output the scraped data.')
  .option('-p --page <slug>',
    'Required. The slug of the page to scrape (the URL after "fugazi_live_series").')
  .option('-c --count [count]',
    'Optional. The number of pages to scrape (default: 1); 0 to scrape until the end of the dataset.', 1)
  .action(function () {
    // Validate all input before launching a browser; each helper terminates
    // the process itself when its value is unusable.
    const slug = validatePage(program.page);
    const pageLimit = validateCount(program.count);
    const outputStream = createOutputStream(program.output);
    process.stdout.write('Fugetting data..');
    puppeteer.launch().then(async browser => {
      const page = await browser.newPage();
      const response = await page.goto(`${baseUrl}/${slug}`);
      // Ask the response for its numeric status code. The status code is not
      // an HTTP header, so looking it up in response.headers() is not reliable.
      const status = (response === null) ? 'unknown' : response.status();
      if (status !== 200) {
        console.error(chalk.red('Request of %s failed with a status of %s.'), page.url(), status);
        process.exit(1);
      }
      /*
       * Add headings to CSV.
       *
       * This is brittle, as it will need to be kept in sync with any changes
       * made to the release data collected in the loop below. However, CSV
       * headings are useful, and all the CSV libraries I looked at were
       * extremely opinionated about the CSVs having the same number of fields
       * (to the point where fields would be dropped if they didn't match the
       * heading), so for now:
       */
      outputStream.write(prepareCsvRow([
        'Page Slug',
        'Release ID',
        'Show Date',
        'Venue',
        'Door Price',
        'Attendance',
        'Recorded by',
        'Mastered by',
        'Original Source',
        'Tracks =>'
      ]));
      // Labels of the release details to extract, in CSV column order. Note
      // that the "... by" labels are looked up without a trailing colon.
      const detailLabels = [
        'Show Date:',
        'Venue:',
        'Door Price:',
        'Attendance:',
        'Recorded by',
        'Mastered by',
        'Original Source:'
      ];
      let morePagesToScrape;
      let i = 0;
      do {
        // This is an "economy" progress indicator. A bona fide progress bar
        // isn't worth the trouble (or would be misleading) because there are
        // lots of cases where it's not easy to determine how many pages are
        // left to scrape (e.g., --count is set to 50 but there are only 2 items
        // left in the dataset, or --count is set to 0 (i.e., all pages) but
        // that number is unknown).
        process.stdout.write('.');
        // The last path segment of the current URL is recorded as the slug.
        const releaseData = [
          page.url().split('/').pop(),
          await extractReleaseId(page)
        ];
        for (const label of detailLabels) {
          releaseData.push(await extractReleaseDetails(label, page));
        }
        // Tracks go last because their number varies from page to page.
        const tracks = await extractTracks(page);
        outputStream.write(prepareCsvRow(releaseData.concat(tracks)));
        // A page limit of 0 means "keep going until there is no next link".
        morePagesToScrape = (++i < pageLimit) || (pageLimit === 0);
        if (morePagesToScrape) {
          try {
            // $eval rejects when nothing matches the selector, which is how the
            // end of the dataset is detected.
            // NOTE(review): nothing here waits for the click to finish loading
            // the next page before the next iteration reads page.url() and the
            // DOM -- confirm the waitForSelector calls in the extract helpers
            // are sufficient, or pair the click with page.waitForNavigation().
            await page.$eval('#nextButton a', match => {
              match.click();
            });
          } catch (e) {
            console.log(chalk.bold('No "next" link; reached end of scrapable data.'));
            morePagesToScrape = false;
          }
        }
      } while (morePagesToScrape);
      console.log(`\nDone. Data saved in ${program.output}.`);
      outputStream.close();
      await browser.close();
    }).catch(error => {
      // Report any failure of the scrape (browser launch, navigation, selector
      // timeout, ...) instead of leaving an unhandled promise rejection.
      console.error(chalk.red(error instanceof Error ? error.message : String(error)));
      process.exit(1);
    });
  })
  .parse(process.argv);
/**
 * Returns the supplied string with every CSV text delimiter escaped.
 *
 * A delimiter is escaped by doubling it, so with a delimiter of `"` the
 * input `say "hi"` becomes `say ""hi""`.
 *
 * @param {String} string
 *   The raw field value.
 * @return {String}
 *   The field value with each occurrence of the delimiter doubled.
 */
function escapeCsvTextDelimiter(string) {
  const escapedDelimiter = `${csvTextDelimiter}${csvTextDelimiter}`;
  // split/join replaces *all* occurrences (a RegExp without the "g" flag only
  // replaces the first one) and treats the delimiter as literal text, so it
  // stays correct even if the delimiter is a RegExp metacharacter.
  return string.split(csvTextDelimiter).join(escapedDelimiter);
}
/**
 * Looks up a single "release detail" on the page.
 *
 * The details live in a definition list (`dl.release-details`) whose markup
 * offers no attributes to select on, so the value is found through the text
 * of its definition term: the `dd` following the `dt` whose text equals the
 * label.
 *
 * @param {String} label
 *   The exact text of the definition term, e.g., "Show Date:"
 * @param {Page} page
 * @see https://github.com/GoogleChrome/puppeteer/blob/v1.10.0/docs/api.md#class-page
 * @return {Promise<String>}
 *   Resolves to the trimmed text of the matching definition, e.g.,
 *   "1993-02-13", or to an empty string when no term matches the label.
 */
async function extractReleaseDetails(label, page) {
  const detailsList = await page.waitForSelector('dl.release-details');
  const xpath = `//dt[text()="${label}"]/following-sibling::dd`;
  const definitions = await detailsList.$x(xpath);
  const definition = definitions[0];
  if (typeof definition === 'undefined') {
    return '';
  }
  return page.evaluate(el => el.textContent.trim(), definition);
}
/**
 * Reads the release ID shown on a page.
 *
 * Waits for the `#productInfo` element, takes the text of the `.releaseNumber`
 * element inside it and strips the "Fugazi Live Series" prefix text and any
 * surrounding whitespace.
 *
 * @param {Page} page
 * @see https://github.com/GoogleChrome/puppeteer/blob/v1.10.0/docs/api.md#class-page
 * @return {Promise<String>}
 *   Promise which resolves to the release ID.
 */
async function extractReleaseId(page) {
  const productInfo = await page.waitForSelector('#productInfo');
  return productInfo.$eval(
    '.releaseNumber',
    el => el.innerText.replace('Fugazi Live Series', '').trim()
  );
}
/**
 * Collects the track titles listed on a page.
 *
 * Waits for the `.mp3_list` element and returns the trimmed text of every
 * `.track_name` element inside it, in document order.
 *
 * @param {Page} page
 * @see https://github.com/GoogleChrome/puppeteer/blob/v1.10.0/docs/api.md#class-page
 * @return {Promise<Array<String>>}
 *   Promise which resolves to an Array of track titles.
 */
async function extractTracks(page) {
  const trackList = await page.waitForSelector('.mp3_list');
  return trackList.$$eval(
    '.track_name',
    trackElements => trackElements.map(el => el.innerText.trim())
  );
}
/**
 * Opens the output file for writing, or aborts the program.
 *
 * The file is opened with the "wx" flag, so an existing file is never
 * overwritten. A missing filename, an already existing file, or any other
 * stream error prints a message and exits with status 1.
 *
 * @param {String} filename
 *   Path of the file to create.
 * @return {fs.WriteStream}
 */
function createOutputStream(filename) {
  if (typeof filename === 'undefined') {
    console.error(chalk.red('Output is a required parameter.'));
    process.exit(1);
  }
  const stream = fs.createWriteStream(filename, {flags: 'wx'});
  stream.on('error', error => {
    // EEXIST gets a friendlier explanation; everything else is reported as-is.
    const message = (error.code === 'EEXIST')
      ? `File ${filename} already exists; aborting so as not to overwrite.`
      : error.message;
    console.error(chalk.red(message));
    process.exit(1);
  });
  return stream;
}
/**
 * Turns a list of values into one line of CSV.
 *
 * Every value is escaped, wrapped in the text delimiter and then joined with
 * the field separator.
 *
 * @param {Array} data
 *   The values which are to be written to CSV, in column order.
 * @return {String}
 *   A string, complete with line feed, which can be inserted into a CSV file.
 */
function prepareCsvRow(data = []) {
  const quotedFields = data.map(datum => {
    const escaped = escapeCsvTextDelimiter(datum);
    return `${csvTextDelimiter}${escaped}${csvTextDelimiter}`;
  });
  return `${quotedFields.join(csvFieldSeparator)}\n`;
}
/**
 * Returns a validated page slug or aborts the program.
 *
 * Accepts a bare slug, a path containing "fugazi_live_series/", or a full
 * URL; in every case only the part after "fugazi_live_series/" is returned,
 * without leading slashes, so it can be appended to the base URL.
 *
 * @param {String} value
 *   The value supplied for --page.
 * @return {String}
 *   The page slug.
 */
function validatePage(value) {
  if (typeof value === 'undefined') {
    console.error(chalk.red('Page is a required parameter.'));
    process.exit(1);
  }
  // In case the whole path (or URL) is specified, keep only what follows the
  // series segment. Merely deleting the segment would leave any scheme/host
  // or leading slash in front of the slug.
  const marker = 'fugazi_live_series/';
  const markerIndex = value.indexOf(marker);
  const slug = (markerIndex === -1) ? value : value.slice(markerIndex + marker.length);
  return slug.replace(/^\/+/, '');
}
/**
 * Returns a validated count or aborts the program.
 *
 * The value is converted with Number(); anything that is not an integer
 * greater than or equal to zero prints an error and exits with status 1.
 *
 * @param {String} value
 *   The value supplied for --count.
 * @return {Number}
 *   The page count as a non-negative integer.
 */
function validateCount(value) {
  const parsed = Number(value);
  const isValid = Number.isInteger(parsed) && parsed >= 0;
  if (!isValid) {
    console.error(chalk.red('Count must be a non-negative integer.'));
    process.exit(1);
  }
  return parsed;
}