Skip to content

Commit

Permalink
Use TS for scripting and building
Browse files (browse the repository at this point in the history)
  • Loading branch information
ijemmao committed Aug 9, 2024
1 parent 250941d commit 2e92c7c
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 44 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
node_modules/
*.log
index.js
dist/
*.log
11 changes: 6 additions & 5 deletions package.json
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,11 +2,10 @@
"name": "webscraper",
"version": "1.0.0",
"description": "scrapes the bbc igbo",
"main": "webscraper.js",
"scripts": {
"dev": "nodemon --exec 'babel-node webscraper.js'",
"build": "./node_modules/.bin/babel webscraper.js --out-file index.js",
"start": "node index.js",
"dev": "nodemon -e ts",
"build": "tsc",
"start": "node dist/webscraper.js",
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
Expand All @@ -18,12 +17,14 @@
"axios": "^0.21.1",
"babel-node": "^0.0.1-security",
"cheerio": "^1.0.0-rc.5",
"lodash": "^4.17.21"
"lodash": "^4.17.21",
"typescript": "^5.5.4"
},
"devDependencies": {
"@babel/plugin-proposal-optional-chaining": "^7.12.7",
"@babel/plugin-transform-runtime": "^7.12.10",
"@babel/preset-env": "^7.12.11",
"@types/node": "^22.1.0",
"nodemon": "^2.0.7"
}
}
10 changes: 10 additions & 0 deletions tsconfig.json
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,10 @@
{
"compilerOptions": {
"target": "es2015",
"module": "CommonJS",
"sourceMap": false,
"outDir": "dist",
"resolveJsonModule": true,
"esModuleInterop": true
}
}
84 changes: 47 additions & 37 deletions webscraper.js → webscraper.ts
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,41 +8,48 @@ const bbcIgbo = 'https://bbc.com/igbo';
const articleBases = ['/igbo/afirika-', '/igbo/articles/'];
const MAX_DEPTH = 2;

const finalObject = {}
const finalObject = {};

var dir = './articles';

if (!fs.existsSync(dir)){
fs.mkdirSync(dir);
if (!fs.existsSync(dir)) {
fs.mkdirSync(dir);
}

const scrapeDataAndWriteFile = ({ link, data, depth }) => {
const $ = cheerio.load(data);
const docName = split(link, '/')[2].split('-')[1] || last(split(link, '/'));
finalObject[docName] = { published: '', sentences: [] };
finalObject[docName].published = $('time').first().text();
finalObject[docName].sentences = reduce($('p, li').map((_, text) => {
const finalText = $(text).text().replace(/^([0-9:\.])+/g, '');
return finalText;
}
), (listOfSentences, sentence) => {
listOfSentences.push(trim(sentence));
return listOfSentences;
}, []);
finalObject[docName].sentences = reduce(
$('p, li').map((_, text) => {
const finalText = $(text)
.text()
.replace(/^([0-9:\.])+/g, '');
return finalText;
}),
(listOfSentences, sentence) => {
listOfSentences.push(trim(sentence));
return listOfSentences;
},
[]
);
const filePath = `articles/${docName}.json`;
// Writes to file if it doesn't exist
if (!fs.existsSync(filePath)) {
fs.writeFileSync(filePath, JSON.stringify(finalObject[docName], null, 2))
fs.writeFileSync(filePath, JSON.stringify(finalObject[docName], null, 2));
console.log(`Successfully wrote ${docName}`);
}
return scrapeContent({ startLink: `${bbc}${link}`, depth: depth + 1 });
}
};

const collectValidArticleLinks = ({ data }) => {
const $ = cheerio.load(data);
return filter($('a').map((_, anchor) => (
$(anchor).attr('href')
)), (anchorLink) => articleBases.some((articleBase) => anchorLink.startsWith(articleBase)));
return filter(
$('a').map((_, anchor) => $(anchor).attr('href')),
(anchorLink) =>
articleBases.some((articleBase) => anchorLink.startsWith(articleBase))
);
};

const visitStartLink = ({ link, data, depth }) => {
Expand All @@ -60,27 +67,30 @@ const scrapeContent = ({ startLink, depth }) => {
return;
}

return axios.get(startLink)
return axios
.get(startLink)
.then(({ data }) => visitStartLink({ link: startLink, data, depth }))
.then((articleLinks) => {
Promise.all(map(articleLinks, (link) => {
try {
return axios.get(`${bbc}${link}`)
.then(({ data }) => scrapeDataAndWriteFile({ link, data, depth }))
.catch((err) => {
console.log('Caught error in .then():', err.message);
process.exit(1);
});
} catch (err) {
console.log('Caught error:', err.message);
process.exit(1);
}
}))
.then(() => {
console.log(finalObject);
process.exit(0);
});
});
}
Promise.all(
map(articleLinks, (link) => {
try {
return axios
.get(`${bbc}${link}`)
.then(({ data }) => scrapeDataAndWriteFile({ link, data, depth }))
.catch((err) => {
console.log('Caught error in .then():', err.message);
process.exit(1);
});
} catch (err) {
console.log('Caught error:', err.message);
process.exit(1);
}
})
).then(() => {
// console.log(finalObject);
process.exit(0);
});
});
};

scrapeContent({ startLink: bbcIgbo, depth: 0 });
scrapeContent({ startLink: bbcIgbo, depth: 0 });
17 changes: 17 additions & 0 deletions yarn.lock
Original file line number | Diff line number | Diff line change
Expand Up @@ -893,6 +893,13 @@
dependencies:
defer-to-connect "^1.0.1"

"@types/node@^22.1.0":
version "22.1.0"
resolved "https://registry.yarnpkg.com/@types/node/-/node-22.1.0.tgz#6d6adc648b5e03f0e83c78dc788c2b037d0ad94b"
integrity sha512-AOmuRF0R2/5j1knA3c6G3HOk523Ga+l+ZXltX8SF1+5oqcXijjfTd8fY3XRZqSihEu9XhtQnKYLmkFaoxgsJHw==
dependencies:
undici-types "~6.13.0"

abbrev@1:
version "1.1.1"
resolved "https://registry.yarnpkg.com/abbrev/-/abbrev-1.1.1.tgz#f8f2c887ad10bf67f634f005b6987fed3179aac8"
Expand Down Expand Up @@ -2934,13 +2941,23 @@ typedarray-to-buffer@^3.1.5:
dependencies:
is-typedarray "^1.0.0"

typescript@^5.5.4:
version "5.5.4"
resolved "https://registry.yarnpkg.com/typescript/-/typescript-5.5.4.tgz#d9852d6c82bad2d2eda4fd74a5762a8f5909e9ba"
integrity sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==

undefsafe@^2.0.3:
version "2.0.3"
resolved "https://registry.yarnpkg.com/undefsafe/-/undefsafe-2.0.3.tgz#6b166e7094ad46313b2202da7ecc2cd7cc6e7aae"
integrity sha512-nrXZwwXrD/T/JXeygJqdCO6NZZ1L66HrxM/Z7mIq2oPanoN0F1nLx3lwJMu6AwJY69hdixaFQOuoYsMjE5/C2A==
dependencies:
debug "^2.2.0"

undici-types@~6.13.0:
version "6.13.0"
resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-6.13.0.tgz#e3e79220ab8c81ed1496b5812471afd7cf075ea5"
integrity sha512-xtFJHudx8S2DSoujjMd1WeWvn7KKWFRESZTMeL1RptAYERu29D6jphMjjY+vn96jvN3kVPDNxU/E13VTaXj6jg==

unicode-canonical-property-names-ecmascript@^1.0.4:
version "1.0.4"
resolved "https://registry.yarnpkg.com/unicode-canonical-property-names-ecmascript/-/unicode-canonical-property-names-ecmascript-1.0.4.tgz#2619800c4c825800efdd8343af7dd9933cbe2818"
Expand Down

0 comments on commit 2e92c7c

Please sign in to comment.