-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.js
82 lines (66 loc) · 1.91 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import cheerio from 'cheerio';
import { constructDateUrl, getPromise } from './helpers';
/**
 * Tells whether `text` contains all three capital letters "B", "L" and "M"
 * (anywhere, in any order). The check is case-sensitive.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} `true` only when every one of the three letters occurs.
 */
const isSectionHeader = text =>
  ['B', 'L', 'M'].every(letter => text.includes(letter));
/**
 * Parsed view of one fetched page, backed by a cheerio document.
 *
 * The page is treated as a flat list of `.MsoNormal` elements; those whose
 * text satisfies `isSectionHeader` start a section, and each section exposes
 * the PDF links found in the paragraphs that follow its header.
 */
class Day {
  /**
   * @param {string} body - Markup handed to `cheerio.load`.
   */
  constructor(body) {
    this.body = cheerio.load(body, {
      decodeEntities: true,
    });
  }

  /**
   * All elements matching `.MsoNormal`, in the order cheerio yields them.
   *
   * @returns {Array} Raw (unwrapped) elements.
   */
  get rawSections() {
    const sections = [];
    // Braced callback: cheerio's `.each` stops when the callback returns
    // `false`, so the callback must not leak a return value.
    this.body('.MsoNormal').each((i, sp) => {
      sections.push(sp);
    });
    return sections;
  }

  /**
   * Map of section header text to that section's collected links.
   *
   * Works on the header *elements* (not their text), because both
   * `sectionHeader` and `section` need an element to traverse from.
   * Headers that yield the same header text overwrite one another.
   *
   * @returns {Object<string, {links: Array<{text: string, href: string}>}>}
   */
  get sections() {
    const headerElements = this.rawSections
      .filter(sec => isSectionHeader(this.body(sec).text()));
    return headerElements.reduce((col, sp) => ({
      ...col,
      [this.sectionHeader(sp)]: this.section(sp),
    }), {});
  }

  /**
   * Extracts the header text nested inside a section header element.
   *
   * Tries the `u b font` nesting first and falls back to `span b u font`
   * when the first lookup produces no text.
   *
   * @param {*} sp - Element (or anything `this.body()` accepts) to search in.
   * @returns {string} Header text, or an empty string when neither matches.
   */
  sectionHeader(sp) {
    const primary = this.body(sp).find('u b font').text();
    if (primary) {
      return primary;
    }
    return this.body(sp).find('span b u font').text();
  }

  /**
   * Texts of the raw sections that qualify as section headers.
   *
   * @param {Array} rawSections - Elements such as those from `rawSections`.
   * @returns {string[]} Text of every element accepted by `isSectionHeader`.
   */
  sectionHeadings(rawSections) {
    return rawSections
      .map(sec => this.body(sec).text())
      .filter(text => isSectionHeader(text));
  }

  /**
   * Collects the PDF links that belong to one section.
   *
   * Starts at the `<p>` that is, or encloses, the header element and walks
   * forward through sibling paragraphs, gathering every anchor whose `href`
   * contains "pdf".
   *
   * @param {*} sectionHeading - Header element of the section.
   * @returns {{links: Array<{text: string, href: string}>}}
   */
  section(sectionHeading) {
    const secRes = {
      links: [],
    };
    // `closest('p')` resolves to the header paragraph itself; the previous
    // `parentsUntil('p')` excluded that paragraph, so the sibling walk below
    // had nothing to start from.
    // NOTE(review): assumes the header lives in (or is) a <p> — confirm
    // against real pages.
    let headerP = this.body(sectionHeading).closest('p');
    // NOTE(review): the walk stops as soon as the *next* paragraph has no
    // text, so the last paragraph that has text is never scanned. Kept as
    // originally written; confirm this is intended.
    while (headerP.next('p').text()) {
      headerP.find('a').each((idx, link) => {
        const anchor = this.body(link);
        const href = anchor.attr('href');
        // Anchors without an href are skipped instead of throwing, and the
        // callback returns nothing so a non-PDF link cannot abort `.each`.
        if (href && href.includes('pdf')) {
          secRes.links.push({
            text: anchor.text(),
            href,
          });
        }
      });
      headerP = headerP.next('p');
    }
    return secRes;
  }
}
/**
 * Scraper function.
 *
 * Fetches the page for `date` (URL built by `constructDateUrl`, fetched via
 * `getPromise`) and parses it with `Day`.
 *
 * @param {*} date - Date identifier; also used as the key of the result.
 * @returns {Promise<Object>} Resolves to `{ [date]: sections }`, where
 *   `sections` is the `sections` value of the `Day` built from the response.
 *   Rejects with the original error when fetching or parsing fails (the
 *   error is still logged first); previously the promise never settled in
 *   that case.
 */
const scrape = async date => {
  try {
    const response = await getPromise(constructDateUrl(date));
    return { [date]: new Day(response).sections };
  } catch (err) {
    // Keep the existing log output, then propagate so callers can react
    // instead of waiting forever on a promise that never settles.
    console.error(err);
    throw err;
  }
};
export default scrape;