Skip to content

Commit

Permalink
Refactor analyzer, get most used keywords
Browse files Browse the repository at this point in the history
  • Loading branch information
renoirb committed Nov 6, 2017
1 parent 334f03b commit 29aff30
Show file tree
Hide file tree
Showing 5 changed files with 210 additions and 118 deletions.
116 changes: 116 additions & 0 deletions src/analyze.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
'use strict';

import * as fsa from 'async-file';

import {
readLines,
handleIndexSourceErrors,
readCached,
figureOutTruncateAndSelector,
cheerioLoad
} from './common';

import {stopWords} from './stopwords';

const URL_LIST = 'archive/index.csv';
const OVERWRITE = true;

/**
 * Strip punctuation, symbols and underscores from a string.
 *
 * ASCII word characters, whitespace and the accented Latin-1 letters
 * (À-Ö, Ø-ö, ø-ÿ) are kept. The accented ranges mirror the word filter used
 * by extractWords, so that a word such as "café" is not truncated to "caf"
 * before it gets counted.
 *
 * @param {string} input - Text to clean up.
 * @returns {string} The input without punctuation or underscores.
 */
function removePunctuation(input) {
  // `\w` alone only covers [A-Za-z0-9_]; accented letters must be listed explicitly.
  return input.replace(/[^\w\sÀ-ÖØ-öø-ÿ]|_/g, '');
}

/**
 * Count the words that appear more than once in a document.
 *
 * The markup is loaded through cheerioLoad, the elements matching the
 * `truncate` selector computed by figureOutTruncateAndSelector are removed,
 * and the remaining text is tokenized on any run of whitespace. Each token is
 * stripped of punctuation and lower-cased; tokens that are not purely
 * alphabetic, or that are listed in stopWords, are ignored.
 *
 * @param {string} recv - Markup handed to cheerioLoad (presumably the cached HTML; confirm against caller).
 * @param {Object} source - Source descriptor handed to figureOutTruncateAndSelector.
 * @returns {Promise<Object>} Prototype-less object mapping each word seen at
 *   least twice to its number of occurrences. Words seen only once are omitted.
 */
async function extractWords(recv, source) {
  const shard = await cheerioLoad(recv);
  const {truncate} = figureOutTruncateAndSelector(source);
  shard(truncate).remove();

  const wordPattern = /^[a-zA-ZÀ-ÖØ-öø-ÿ]+$/;
  const words = Object.create(null);
  const foundOnce = new Set();

  // Split on any whitespace run: splitting on a single space left words
  // separated by newlines or tabs glued together, and they were then
  // rejected by wordPattern instead of being counted.
  for (const token of shard.text().split(/\s+/)) {
    const w = removePunctuation(token).toLowerCase();
    if (!wordPattern.test(w) || stopWords.has(w)) {
      continue;
    }
    if (!foundOnce.has(w)) {
      // First sighting: remember it, but only report words that repeat.
      foundOnce.add(w);
      continue;
    }
    // Second sighting starts the count at 2, later ones increment it.
    words[w] = (words[w] || 1) + 1;
  }

  return words;
}

/**
 * Build the analysis descriptor for one archived source.
 *
 * When `archive/<slug>/cache.html` exists, its content is read through
 * readCached and the repeated words found by extractWords are stored under
 * `data.words`. When it does not exist, `data` is left empty.
 *
 * @param {Object} source - Source descriptor; its `slug` names the archive folder.
 * @returns {Promise<{file: string, data: Object}>} Target JSON file name and collected data.
 */
async function read(source) {
  const directory = `archive/${source.slug}`;
  const cachePath = `${directory}/cache.html`;
  const descriptor = {file: `${directory}/analyze.json`, data: {}};

  if ((await fsa.exists(cachePath)) === true) {
    const html = await readCached(cachePath);
    descriptor.data.words = await extractWords(html, source);
  }

  return descriptor;
}

/**
 * Turn a `{key: count}` object into `[key, count]` pairs ordered from the
 * highest count to the lowest.
 *
 * @param {Object} subject - Object whose enumerable properties hold numeric counts.
 * @returns {Array<Array>} Pairs in the form [ [ key1, val1 ], [ key2, val2 ], ... ].
 */
function sort(subject) {
  const pairs = [];
  for (const name in subject) {
    pairs.push([name, subject[name]]);
  }

  // Descending order: most occurrences first.
  return pairs.sort((left, right) => right[1] - left[1]);
}

/**
 * Pick the most used words of a descriptor and store them as its keywords.
 *
 * The words in `recv.data.words` are ranked by number of occurrences; at most
 * 10 of the top-ranked words are considered, and of those only the ones used
 * more than 3 times are kept. The result is written to `recv.data.keywords`
 * (the descriptor is modified in place) and the same descriptor is returned.
 *
 * @param {{file: string, data: Object}} recv - Descriptor; `data.words` maps word to count.
 * @returns {Promise<{file: string, data: Object}>} The same descriptor, with `data.keywords` set.
 */
async function analyze(recv) {
  const max = 10; // upper bound on the number of keywords
  const minUsed = 3; // a keyword must be used strictly more than this
  const keywords = Object.create(null);

  // The list is ordered by decreasing count, so the first `max` entries are
  // the candidates. The previous `iter <= max` check let an 11th word through.
  for (const [word, used] of sort(recv.data.words).slice(0, max)) {
    if (used > minUsed) {
      keywords[word] = used;
    }
  }

  recv.data.keywords = keywords;

  return recv;
}

/**
 * Persist a descriptor's data as JSON.
 *
 * The file is written when it does not exist yet, or when it exists and
 * `boolOverwrite` is truthy; otherwise nothing is written.
 *
 * @param {{file: string, data: Object}} descriptor - Target file name and the data to serialize.
 * @param {boolean} [boolOverwrite=false] - Whether an existing file may be replaced.
 * @returns {Promise<{file: string, data: Object}>} The file name and data that were received.
 */
async function write({file, data = {}}, boolOverwrite = false) {
  const alreadyThere = await fsa.exists(file);
  const mayWrite = alreadyThere === false
    ? true
    : alreadyThere === true && boolOverwrite;

  if (mayWrite) {
    const payload = JSON.stringify(data);
    await fsa.writeTextFile(file, payload, 'utf8');
  }

  return {file, data};
}

/**
 * Run the read -> analyze -> write pipeline for every entry of URL_LIST.
 *
 * TODO: each entry gets its own independent promise chain and nothing waits
 * for them as a group. This should be reworked so that every step goes
 * through Promise.all(...).
 */
for (const source of readLines(URL_LIST)) {
  Promise.resolve(source)
    .then(read)
    .then(analyze)
    .then(analyzed => write(analyzed, OVERWRITE))
    .catch(handleIndexSourceErrors);
}
3 changes: 2 additions & 1 deletion src/archive.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ const [...urls] = readLines(URL_LIST);

/**
* Something is going somewhat as an anti-pattern here.
* Gotta wire generator and async/await TODO
* We want Promise.all(...) at each step, and it's not how
* it is as of now. Needs rework here. TODO
*/
Promise.all(urls)
.then(u => fetcher(u))
Expand Down
28 changes: 27 additions & 1 deletion src/common.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import fs from 'fs';
import pathutil from 'path';
import cheerio from 'cheerio';
import * as fsa from 'async-file';
import lines from 'gen-readlines';
import slugifier from './normalizer/slugs';
Expand Down Expand Up @@ -75,10 +76,35 @@ function readCachedError(errorObj) {
}
}

// TODO: make it possible to run extractLinks, markdownify, ... in parallel.
/**
 * Promise-returning wrapper around `cheerio.load`.
 *
 * @param {string} recv - Markup handed to `cheerio.load`.
 * @param {Object} [configObj={}] - Options forwarded to `cheerio.load`.
 * @returns {Promise<*>} Resolves with whatever `cheerio.load` returns; rejects if it throws.
 */
async function cheerioLoad(recv, configObj = {}) {
  // An async function already wraps its return value (or thrown error) in a promise.
  return cheerio.load(recv, configObj);
}

/**
 * Work out the CSS selectors to use for one row of the source .csv file,
 * where a row looks like:
 * http://example.org/a/b.html;selector;truncate
 *
 * - selector: CSS selector of the main content; 'body' is used when empty.
 * - truncate: comma-separated CSS selectors to strip off. The selectors
 *   'script,style,noscript' are always appended to it.
 *
 * @param {{selector: string, truncate: string}} sourceArgument - Parsed row.
 * @returns {{selector: string, truncate: string}} Selectors ready to be used.
 */
function figureOutTruncateAndSelector(sourceArgument) {
  const alwaysStripped = 'script,style,noscript';

  // Without an explicit main-content selector, fall back to the whole body.
  const hasSelector = sourceArgument.selector.length !== 0;
  const selector = hasSelector ? `${sourceArgument.selector}` : 'body';

  // Row-specific patterns come first, followed by the always-removed tags.
  const hasTruncate = sourceArgument.truncate.length !== 0;
  const truncate = hasTruncate
    ? `${sourceArgument.truncate},${alwaysStripped}`
    : alwaysStripped;

  return {selector, truncate};
}

export {
readCached,
readLines,
coroutine,
parseCsvLine,
handleIndexSourceErrors
handleIndexSourceErrors,
figureOutTruncateAndSelector,
cheerioLoad
};
6 changes: 5 additions & 1 deletion src/stopwords.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 29aff30

Please sign in to comment.