-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathpitchfork.js
executable file
·185 lines (161 loc) · 5.08 KB
/
pitchfork.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
const { URL } = require('url');
const fetch = require('node-fetch');
const cheerio = require('cheerio');
const minimist = require('minimist');
const sqlite = require('sqlite');
const SQL = require('sql-template-strings');
const Promise = require('bluebird');
/* eslint-disable no-console */
// Default upper bound on listing pages to fetch; used when no --pages argument is given.
const MAX_PAGES = 2000;
// Number of album rows whose review pages are fetched together in one batch by pageRows.
const MAX_SCORES_PER_PAGE = 25;
/**
 * Creates the callback that stores one `.review` element of a listing page.
 *
 * For each review the callback reads title, artists, genres, publication date,
 * artwork and link from the markup, then writes the album, its artists and
 * genres, and the album/artist and album/genre link rows through `db`.
 *
 * @param {object} db - Database handle exposing promise-returning `run(query)`
 *   and `get(query)`.
 * @param {Function} $ - Cheerio instance loaded with the listing page.
 * @returns {Function} Async `(i, review)` callback; `i` is ignored and `review`
 *   is the element to parse. The returned promise resolves to `undefined`.
 */
const parseReview = (db, $) => {
  // Cheerio's map hands over (index, node); only the node's text is needed.
  const textOfNode = (_, node) => $(node).text();

  // Runs an INSERT OR IGNORE and resolves to the row's id: the fresh lastID
  // when a row was written, otherwise the id returned by the lookup query.
  // The lookup is passed as a thunk so it is only built when it is needed.
  const insertedOrExistingId = async (insertQuery, buildLookupQuery) => {
    const { stmt } = await db.run(insertQuery);
    if (stmt.changes !== 0) {
      return stmt.lastID;
    }
    const { id } = await db.get(buildLookupQuery());
    return id;
  };

  // Runs a link-table insert and resolves to the statement's lastID.
  const insertLink = async linkQuery => {
    const { stmt } = await db.run(linkQuery);
    return stmt.lastID;
  };

  return async (i, review) => {
    const $review = $(review);
    const title = $review.find('.review__title-album').text();
    const artists = $review
      .find('.review__title-artist li')
      .map(textOfNode)
      .get();
    const genres = $review
      .find('.genre-list__link')
      .map(textOfNode)
      .get();
    const date = $review.find('.pub-date').attr('datetime');
    const image = $review.find('.review__artwork img').attr('src');
    const href = $review.find('.review__link').attr('href');
    // Review links are resolved against the site origin to get an absolute URL.
    const { href: url } = new URL(href, 'https://pitchfork.com');

    const albumId = await insertedOrExistingId(
      SQL`INSERT OR IGNORE INTO albums (name, date, image, url) VALUES (${title}, ${date}, ${image}, ${url})`,
      () => SQL`SELECT album_id as id FROM albums WHERE name = ${title} AND url = ${url}`
    );

    const artistIds = await Promise.all(
      artists.map(artist =>
        insertedOrExistingId(
          SQL`INSERT OR IGNORE INTO artists (name) VALUES (${artist})`,
          () => SQL`SELECT artist_id as id FROM artists WHERE name = ${artist}`
        )
      )
    );
    console.log(albumId, title, artists, artistIds);

    await Promise.all(
      artistIds.map(artistId =>
        insertLink(
          SQL`INSERT OR IGNORE INTO album_artists (album_id, artist_id) VALUES (${albumId}, ${artistId})`
        )
      )
    );

    const genreIds = await Promise.all(
      genres.map(genre =>
        insertedOrExistingId(
          SQL`INSERT OR IGNORE INTO genres (name) VALUES (${genre})`,
          () => SQL`SELECT genre_id as id FROM genres WHERE name = ${genre}`
        )
      )
    );

    await Promise.all(
      genreIds.map(genreId =>
        insertLink(
          SQL`INSERT OR IGNORE INTO album_genres (album_id, genre_id) VALUES (${albumId}, ${genreId})`
        )
      )
    );
  };
};
/**
 * Fetches listing pages starting at page `i` and stores every review found.
 *
 * Each page is requested as `${base}?page=${i}`. Every `.review` element is
 * handed to the callback built by `parseReview`, and all of those callbacks are
 * awaited before the next page is requested. The returned promise therefore
 * only resolves once the database writes have finished, and a failed write
 * rejects it instead of being lost.
 *
 * Paging stops at the first page that contains no `.review` element, or once
 * page `maxPages` has been processed.
 *
 * @param {object} options
 * @param {object} options.db - Database handle passed through to `parseReview`.
 * @param {string} options.base - Listing URL without the `page` query string.
 * @param {number} options.maxPages - Last page number to request.
 * @param {number} [options.i=1] - Page number to start from.
 * @returns {Promise<undefined>} Resolves when all visited pages are stored.
 */
const request = async ({ db, base, maxPages, i = 1 }) => {
  const url = `${base}?page=${i}`;
  console.log('Fetching:', url);

  const response = await fetch(url);
  const $ = cheerio.load(await response.text());
  const reviews = $('.review');
  // A page without reviews is treated as the end of the listing.
  if (reviews.length === 0) {
    return;
  }

  const iter = parseReview(db, $);
  // `.get()` yields the matched elements as a plain array so the per-review
  // promises can be collected and awaited (cheerio's `.each` discards them).
  await Promise.all(reviews.get().map((review, index) => iter(index, review)));

  if (i < maxPages) {
    await request({ db, base, maxPages, i: i + 1 });
  }
};
/**
 * Downloads one album review page and extracts the text of its first
 * `.score-box .score` element.
 *
 * Failures are logged and reported as an empty string so that a single bad
 * page does not abort the whole batch.
 *
 * @param {{album_id: number, url: string}} row - Album row to look up.
 * @returns {Promise<string>} The score text, or `''` when it could not be read.
 */
const fetchAlbumScore = async row => {
  try {
    const response = await fetch(row.url);
    const $ = cheerio.load(await response.text());
    return $('.score-box .score')
      .eq(0)
      .text();
  } catch (err) {
    console.error('Failed to fetch score for', row.album_id, row.url, err);
    return '';
  }
};

/**
 * Fetches the score for every album in `rows` and writes it to the database.
 *
 * Rows are processed in batches of `MAX_SCORES_PER_PAGE`; a batch's pages are
 * fetched concurrently and its UPDATE statements are run before the next batch
 * starts, so at most one batch of requests is in flight at a time. The `rows`
 * array is only read, never modified.
 *
 * Rows whose page could not be fetched, or whose page has no score text, are
 * logged and skipped so that an existing score is not replaced by an empty one.
 *
 * @param {object} db - Database handle exposing promise-returning `run(query)`.
 * @param {Array<{album_id: number, url: string}>} rows - Albums to update.
 * @returns {Promise<undefined>} Resolves once every batch has been written.
 */
const pageRows = async (db, rows) => {
  for (let start = 0; start < rows.length; start += MAX_SCORES_PER_PAGE) {
    const batch = rows.slice(start, start + MAX_SCORES_PER_PAGE);
    const scores = await Promise.all(batch.map(row => fetchAlbumScore(row)));

    const queries = [];
    batch.forEach((row, index) => {
      const score = scores[index];
      // The request can succeed and still deliver a page without a score
      // element; writing '' would wipe whatever score is already stored.
      if (score.trim() === '') {
        console.error('No score found for', row.album_id, row.url);
        return;
      }
      queries.push(SQL`UPDATE albums SET score = ${score} WHERE album_id = ${row.album_id}`);
      console.log('Will set', row.album_id, 'score to', score);
    });

    await Promise.all(queries.map(query => db.run(query)));
  }
};
/**
 * Command-line entry point.
 *
 * Flags (parsed with minimist):
 *   --reviews        scrape https://pitchfork.com/reviews/albums/
 *   --best           scrape https://pitchfork.com/reviews/best/albums/
 *   --scores         skip the listing scrape and only fetch album scores
 *   --pages <n>      last listing page to request (defaults to MAX_PAGES)
 *   --offset <n>     with --scores: OFFSET of the album query (default 0)
 *   --limit <n>      with --scores: LIMIT of the album query (default -1)
 *   --fresh          re-run the last database migration before scraping
 *
 * One of --reviews, --best or --scores is required. After an optional listing
 * scrape, scores are fetched for the selected albums and the first ten album
 * rows are printed.
 *
 * @returns {Promise<undefined>}
 * @throws {Error} When none of --reviews, --best or --scores is given.
 */
const main = async () => {
  const argv = minimist(process.argv.slice(2));
  let base;
  let doRequest = true;
  let offset = 0;
  // A negative LIMIT means "no upper bound" in SQLite, so -1 selects all albums.
  let limit = -1;
  const maxPages = argv.pages || MAX_PAGES;

  if (argv.reviews) {
    base = 'https://pitchfork.com/reviews/albums/';
  } else if (argv.best) {
    base = 'https://pitchfork.com/reviews/best/albums/';
  } else if (argv.scores) {
    doRequest = false;
    offset = argv.offset || 0;
    limit = argv.limit || -1;
  } else {
    throw new Error('You must specify a flag to request data.');
  }

  const db = await sqlite.open('./database.sqlite', { Promise, cached: true });
  if (argv.fresh) {
    await db.migrate({ force: 'last' });
  }

  if (doRequest) {
    await request({ db, base, maxPages });
  }

  // Scores are fetched in every mode; offset/limit only differ from their
  // defaults when --scores was used.
  const rows = await db.all(
    SQL`SELECT album_id, url FROM albums ORDER BY album_id DESC LIMIT ${limit} OFFSET ${offset}`
  );
  await pageRows(db, rows);

  const data = await db.all(SQL`SELECT * FROM albums LIMIT 10`);
  console.log(data);
};

// Without a handler, a rejection (including the usage error above) would
// surface as an unhandled promise rejection; log it and signal failure through
// the process exit code instead.
main().catch(err => {
  console.error(err);
  process.exitCode = 1;
});