Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move to a custom substitutions file format #61

Merged
merged 1 commit into from
Jan 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,6 @@ You can see all the chosen character-name titles in the [`book-data/`](./book-da

## Text fixups

This project makes a lot of fixups to the original text, mostly around typos, punctuation, capitalization, and consistency. You can get a more specific idea of what these are via the code; there's [`convert-worker.js`](https://github.com/domenic/worm-scraper/blob/master/lib/convert-worker.js), where some things are handled generally, and [`substitutions.json`](https://github.com/domenic/worm-scraper/blob/master/lib/substitutions.json), for one-off fixes.
This project makes a lot of fixups to the original text, mostly around typos, punctuation, capitalization, and consistency. You can get a more specific idea of what these are via the code: there's [`convert-worker.js`](lib/convert-worker.js), where some things are handled generally, and the [`substitutions/` directory](./substitutions/), for one-off fixes.

This process is designed to be extensible, so if you notice any problems with the original text that you think should be fixed, file an issue to let me know, and we can update the fixup code so that the resulting ebook is improved. (Or better yet, send a pull request!)
55 changes: 30 additions & 25 deletions lib/convert-worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,19 @@
const workerpool = require("workerpool");
const fs = require("fs");
const { JSDOM } = require("jsdom");
const substitutions = require("./substitutions.json");

workerpool.worker({ convertChapter });

function convertChapter(chapter, bookTitle, inputPath, outputPath) {
function convertChapter(chapter, bookTitle, inputPath, outputPath, chapterSubstitutions) {
const contents = fs.readFileSync(inputPath, { encoding: "utf-8" });

const rawChapterJSDOM = new JSDOM(contents);
const { output, warnings } = getChapterString(chapter, bookTitle, rawChapterJSDOM.window.document);
const { output, warnings } = getChapterString(
chapter,
bookTitle,
chapterSubstitutions,
rawChapterJSDOM.window.document
);

// TODO: this should probably not be necessary... jsdom bug I guess!?
rawChapterJSDOM.window.close();
Expand All @@ -19,9 +23,9 @@ function convertChapter(chapter, bookTitle, inputPath, outputPath) {
return warnings;
}

function getChapterString(chapter, bookTitle, rawChapterDoc) {
function getChapterString(chapter, bookTitle, chapterSubstitutions, rawChapterDoc) {
const { xml, warnings } =
getBodyXML(chapter, bookTitle, rawChapterDoc.querySelector(".entry-content"));
getBodyXML(chapter, bookTitle, chapterSubstitutions, rawChapterDoc.querySelector(".entry-content"));

const output = `<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html>
Expand All @@ -40,7 +44,7 @@ ${xml}
return { output, warnings };
}

function getBodyXML(chapter, bookTitle, contentEl) {
function getBodyXML(chapter, bookTitle, chapterSubstitutions, contentEl) {
const warnings = [];

// Remove initial Next Chapter and Previous Chapter <p>
Expand Down Expand Up @@ -287,21 +291,21 @@ function getBodyXML(chapter, bookTitle, contentEl) {
xml = fixParahumansOnline(xml, bookTitle);

// One-off fixes
for (const substitution of substitutions[chapter.url] || []) {
for (const substitution of chapterSubstitutions) {
if (substitution.before) {
const indexOf = xml.indexOf(substitution.before);
if (indexOf === -1) {
warnings.push(`Could not find text "${substitution.before}" in ${chapter.url}. The chapter may have been ` +
`updated at the source, in which case, you should edit substitutions.json.`);
`updated at the source, in which case, you should edit the substitutions file.`);
}
if (indexOf !== xml.lastIndexOf(substitution.before)) {
warnings.push(`The text "${substitution.before}" occurred twice, and so the substitution was ambiguous. ` +
`Update substitutions.json for a more precise substitution.`);
`Update the substitutions file for a more precise substitution.`);
}

xml = xml.replace(new RegExp(escapeRegExp(substitution.before), "u"), substitution.after);
} else if (substitution.regExp) {
xml = xml.replace(new RegExp(substitution.regExp, "ug"), substitution.replacement);
xml = xml.replace(substitution.regExp, substitution.replacement);
} else {
warnings.push(`Invalid substitution specified for ${chapter.url}`);
}
Expand Down Expand Up @@ -336,7 +340,7 @@ function fixTruncatedWords(xml) {
xml = xml.replace(/[‘’][Cc]age(?![a-z])/ug, "’Cage");

// We can't do "’Clear" (short for Crystalclear) here because it appears too much as a normal word preceded by an
// open quote, so we do that in substitutions.json.
// open quote, so we do that in the substitutions file.

return xml;
}
Expand All @@ -355,9 +359,9 @@ function fixDialogueTags(xml) {
//
// This sometimes overcorrects, as in the following example:
// > “Basically,” Alec said, “For your powers to manifest, ...
// Here instead we should lowercase the "f". We handle that via one-offs in substitutions.json.
// Here instead we should lowercase the "f". We handle that via one-offs in the substitutions file.
//
// This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of
// This applies to ~800 instances, so although we have to correct back in the substitutions file a decent number of
// times, it definitely pays for itself. For most of the instances we have to correct back, we also need to fix the
// capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized.
xml = xml.replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/ug, ",” $1. “$2");
Expand Down Expand Up @@ -535,8 +539,8 @@ function fixCapitalization(xml, bookTitle) {
/patrol (block|group|leader|guard|student|uniform|squad|soldier|officer|crew|girl|bus|training)/uig,
(_, $1) => `Patrol ${$1.toLowerCase()}`
);
// This usually works in Ward (some instances corrected back in substitutions.json), and has a few false positives in
// Worm, where it is never needed:
// This usually works in Ward (some instances corrected back in the substitutions file), and has a few false positives
// in Worm, where it is never needed:
if (bookTitle === "Ward") {
xml = xml.replace(/the patrol(?!s|ling)/ug, "the Patrol");
}
Expand Down Expand Up @@ -572,13 +576,14 @@ function fixCapitalization(xml, bookTitle) {
xml = xml.replace(/(?<! {2}|“|>)Flock/ug, "flock");

// Especially early in Worm, PRT designations are capitalized; they should not be. This fixes the cases where we
// can be reasonably sure they don't start a sentence, although more specific instances are done in
// substitutions.json, and some need to be back-corrected.
// can be reasonably sure they don't start a sentence, although more specific instances are done in the substitutions
// file, and some need to be back-corrected.
//
// Note: "Master" is specifically omitted because it fails poorly on Worm Interlude 4. Other instances need to be
// corrected via substitutions.json.
// corrected via the substitutions file.
//
// This also over-de-capitalizes "The Stranger" in Ward (a titan name). Those also get fixed in substitutions.json.
// This also over-de-capitalizes "The Stranger" in Ward (a titan name). Those also get fixed in the substitutions
// file.
xml = xml.replace(
// eslint-disable-next-line max-len
/(?<! {2}|“|>|\n|: )(Mover|Shaker|Brute|Breaker|Tinker|Blaster|Thinker|Striker|Changer|Trump|Stranger|Shifter|Shaper)(?! [A-Z])/ug,
Expand Down Expand Up @@ -615,7 +620,7 @@ function fixCapitalization(xml, bookTitle) {

// "Mom" and "Dad" should be capitalized when used as a proper name. These regexps are tuned to catch a good amount of
// instances, without over-correcting for non-proper-name-like cases. Many other instances are handled in
// substitutions.json.
// the substitutions file.
xml = xml.replace(/(?<!mom), dad(?![a-z])/ug, ", Dad");
xml = xml.replace(/, mom(?![a-z-])/ug, ", Mom");

Expand All @@ -635,8 +640,8 @@ function fixCapitalization(xml, bookTitle) {
xml = xml.replace(/ Neo-/ug, " neo-");

// Style guides disagree on whether items like "english muffin", "french toast", and "french kiss" need their
// adjective capitalized. The books mostly use lowercase, so let's stick with that. (substitutions.json corrects one
// case of "French toast".)
// adjective capitalized. The books mostly use lowercase, so let's stick with that. (The substitutions file corrects
// one case of "French toast".)
xml = xml.replace(/english(?! muffin)/ug, "English");
xml = xml.replace(/(?<! {2})English muffin/ug, "english muffin");

Expand All @@ -652,7 +657,7 @@ function fixCapitalization(xml, bookTitle) {
// All plural discussions of "Titans" are after Sundown 17.y.
xml = xml.replace(/titans/ug, "Titans");

// Since we can't safely change all instances of "titan", most are in substitutions.json. We can do a few here,
// Since we can't safely change all instances of "titan", most are in the substitutions file. We can do a few here,
// though.
xml = xml.replace(/dauntless titan/uig, "Dauntless Titan"); // Sometimes "Dauntless" isn't even capitalized.
xml = xml.replace(/Kronos titan/ug, "Kronos Titan");
Expand Down Expand Up @@ -725,7 +730,7 @@ function fixHyphens(xml) {
// Preemptive(ly) is often hyphenated (not always). It should not be.
xml = xml.replace(/([Pp])re-emptive/ug, "$1reemptive");

// These should be hyphenated only when used as a verb. We correct those cases back in substitutions.json.
// These should be hyphenated only when used as a verb. We correct those cases back in the substitutions file.
xml = xml.replace(/fist-bump/ug, "fist bump");
xml = xml.replace(/high-five/ug, "high five");

Expand Down Expand Up @@ -762,7 +767,7 @@ function standardizeSpellings(xml) {
xml = xml.replace(/(\b)tv(\b)/ug, "$1TV$2");
xml = xml.replace(/t\.v\./uig, "TV");

// "okay" is preferred to "ok" or "o.k.". This sometimes gets changed back via substitutions.json when people are
// "okay" is preferred to "ok" or "o.k.". This sometimes gets changed back via the substitutions file when people are
// writing notes and thus probably the intention was to be less formal. Also it seems per
// https://en.wikipedia.org/wiki/A-ok the "A" in "A-okay" should be capitalized.
xml = xml.replace(/Ok([,. ])/ug, "Okay$1");
Expand Down
143 changes: 142 additions & 1 deletion lib/convert.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ module.exports = async (
chapterDataPath,
contentPath,
bookData,
substitutionsPath,
concurrentJobs,
chapterTitleStyle
) => {
Expand All @@ -19,6 +20,9 @@ module.exports = async (
await fs.writeFile(chapterDataPath, JSON.stringify(chapterData, null, 2));
const flattenedChapters = chapterData.flatMap(arc => arc.chapters);

const substitutionsText = await fs.readFile(substitutionsPath, { encoding: "utf-8" });
const substitutions = parseSubstitutions(substitutionsText);

console.log("Converting raw downloaded HTML to EPUB chapters");
const progress = progressUtils.start(flattenedChapters.length);

Expand All @@ -32,8 +36,15 @@ module.exports = async (
await Promise.all(flattenedChapters.map(async chapter => {
const inputPath = path.resolve(cachePath, chapter.inputFilename);
const outputPath = path.resolve(contentPath, chapter.outputFilename);
const chapterSubstitutions = substitutions.get(chapter.url) || [];

warnings.push(...await pool.exec("convertChapter", [chapter, bookData.title, inputPath, outputPath]));
warnings.push(...await pool.exec("convertChapter", [
chapter,
bookData.title,
inputPath,
outputPath,
chapterSubstitutions
]));

progressUtils.increment(progress);
}));
Expand Down Expand Up @@ -91,3 +102,133 @@ function chooseChapterTitle(chapterData, chapterTitleStyle) {

throw new Error(`Invalid chapter title style: ${chapterTitleStyle}`);
}

/**
 * Parses the text of a substitutions file into per-chapter lists of substitutions.
 *
 * Blank lines are skipped. Every other line must begin with one of these sigils:
 *
 *   "@ "    chapter URL; starts a new chapter section and discards any pending "-" or "r" line
 *   "  - "  literal text to search for (the "before" text)
 *   "  + "  literal replacement (the "after" text) completing the preceding "  - " line; the space after "+" is
 *           optional so that an empty replacement can be written as "  +"
 *   "  r "  regular expression source, compiled with the "ug" flags
 *   "  s "  replacement string completing the preceding "  r " line
 *   "  # "  comment; ignored, but only allowed after a chapter line
 *
 * A literal backslash-n in "-", "+", and "s" content becomes a newline; "-" and "+" content is additionally passed
 * through beforeAfterLineToString().
 *
 * @param {string} text - The full contents of the substitutions file.
 * @returns {Map<string, Array<{before: string, after: string} | {regExp: RegExp, replacement: string}>>} A map from
 *   chapter URL to that chapter's substitutions, in file order.
 * @throws {Error} If a line has an unknown format, appears out of sequence, names a non-canonical chapter URL, or
 *   contains an invalid regular expression. The message includes the offending line and its 1-based line number.
 */
function parseSubstitutions(text) {
  const result = new Map();

  let currentChapter = null;
  let currentBefore = null;
  let currentRegExp = null;

  for (const [index, line] of text.split("\n").entries()) {
    // Skip empty lines
    if (!line.trim()) {
      continue;
    }

    const errorPrefix = `Error in substitutions line "${line}" (line number ${index + 1}): `;

    // Anchored at the start of the line so that a sigil appearing in the middle of a malformed line is rejected
    // instead of being silently accepted.
    const match = /^(@ | {2}- | {2}\+ ?| {2}r | {2}s | {2}# )(.*)/u.exec(line);
    if (match === null) {
      throw new Error(`${errorPrefix}invalid line format`);
    }
    const [, sigil, content] = match;

    // The case labels must match the captured sigils exactly, including the two leading spaces.
    switch (sigil) {
      // New chapter
      case "@ ": {
        if (!isCanonicalizedURL(content)) {
          throw new Error(`${errorPrefix}invalid chapter URL`);
        }

        currentChapter = content;
        if (!result.has(currentChapter)) {
          result.set(currentChapter, []);
        }
        currentBefore = null;
        currentRegExp = null;

        break;
      }

      // Before line
      case "  - ": {
        if (!currentChapter) {
          throw new Error(`${errorPrefix}missing previous current chapter (@) line`);
        }
        if (currentBefore) {
          throw new Error(`${errorPrefix}appeared after a before (-) line`);
        }
        if (currentRegExp) {
          throw new Error(`${errorPrefix}appeared after a regexp (r) line`);
        }

        currentBefore = content.replaceAll("\\n", "\n");

        break;
      }

      // After line (with or without the optional space following the "+")
      case "  +":
      case "  + ": {
        if (!currentChapter || !currentBefore) {
          throw new Error(`${errorPrefix}missing previous current chapter (@) or before (-) line`);
        }
        if (currentRegExp) {
          throw new Error(`${errorPrefix}appeared after a regexp (r) line`);
        }

        result.get(currentChapter).push({
          before: beforeAfterLineToString(currentBefore),
          after: beforeAfterLineToString(content)
        });
        currentBefore = null;

        break;
      }

      // RegExp line
      case "  r ": {
        if (!currentChapter) {
          throw new Error(`${errorPrefix}missing previous current chapter (@) line`);
        }
        if (currentBefore) {
          throw new Error(`${errorPrefix}appeared after a before (-) line`);
        }

        try {
          currentRegExp = new RegExp(content, "ug");
        } catch (cause) {
          // Re-throw with the line context; the bare SyntaxError does not say where the bad pattern came from.
          throw new Error(`${errorPrefix}invalid regular expression`, { cause });
        }

        break;
      }

      // RegExp substitution
      case "  s ": {
        if (!currentChapter || !currentRegExp) {
          throw new Error(`${errorPrefix}missing previous current chapter (@) or regexp (r) line`);
        }

        result.get(currentChapter).push({
          regExp: currentRegExp,
          replacement: content.replaceAll("\\n", "\n")
        });
        currentRegExp = null;

        break;
      }

      // Comment
      case "  # ": {
        if (!currentChapter) {
          throw new Error(`${errorPrefix}missing previous current chapter (@) line`);
        }

        break;
      }
    }
  }

  return result;
}

/**
 * Reports whether a string is an absolute URL that is already in the serialized form the URL parser produces.
 *
 * @param {string} urlString - The candidate URL.
 * @returns {boolean} true if urlString parses as a URL and its serialized href is identical to urlString; false if
 *   it cannot be parsed or if serializing it would change it (for example, a missing trailing slash after the host
 *   or an uppercase scheme).
 */
function isCanonicalizedURL(urlString) {
  let parsed;
  try {
    parsed = new URL(urlString);
  } catch {
    // The URL constructor throws a TypeError for unparseable input. Unparseable input is simply "not canonical",
    // so report false and let the caller raise its own error with line context.
    return false;
  }

  return parsed.href === urlString;
}

/**
 * Decodes the escape sequences used in before/after lines of the substitutions file.
 *
 * Every literal backslash-n becomes a newline, and each backslash-s in a run at the very end of the line becomes
 * one space. A backslash-s anywhere else is left untouched.
 *
 * @param {string} line - The raw line content, without its sigil.
 * @returns {string} The decoded text.
 */
function beforeAfterLineToString(line) {
  const withNewlines = line.replaceAll("\\n", "\n");

  // Each escape is two characters long, so half the matched length is the number of spaces to emit.
  const toSpaces = escapes => " ".repeat(escapes.length / 2);

  return withNewlines.replace(/(?:\\s)+$/u, toSpaces);
}
Loading
Loading