Skip to content

Commit

Permalink
Improve book metadata
Browse files Browse the repository at this point in the history
* Change the modified date from being the date that worm-scraper was run, to the most recent modified date of any chapter.

* Write the worm-scraper information as a "contributor" with role "bkp" ("book producer").

Closes #46.
  • Loading branch information
domenic committed Jan 4, 2025
1 parent 38a5ac1 commit 793a02c
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 12 deletions.
1 change: 1 addition & 0 deletions lib/convert.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ function getChapterData(arcs, manifest, chapterTitleStyle) {
chapter.originalTitle = manifestEntry.title;
chapter.usedTitle = chooseChapterTitle(chapter, chapterTitleStyle);
chapter.datePublished = manifestEntry.datePublished;
chapter.dateModified = manifestEntry.dateModified;
}
}

Expand Down
8 changes: 7 additions & 1 deletion lib/download.js
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,13 @@ async function downloadAllChapters(manifest, bookData, cachePath, manifestPath)
const { contents, dom } = await downloadChapter(chapterURL);
const title = getChapterTitle(dom.window.document);
const datePublished = getChapterDatePublished(dom.window.document);
const dateModified = getChapterDateModified(dom.window.document);

dom.window.close();

manifest[manifestIndex].title = title;
manifest[manifestIndex].datePublished = datePublished;
manifest[manifestIndex].dateModified = dateModified;
manifest[manifestIndex].filename = filename;
await fs.writeFile(path.resolve(cachePath, filename), contents);

Expand Down Expand Up @@ -105,7 +107,11 @@ function getChapterTitle(rawChapterDoc) {
}

// Returns the chapter's publication timestamp string, read from the `content` of the page's
// `article:published_time` <meta> tag. This mirrors how getChapterDateModified reads the modification time.
//
// Previously this block had two consecutive `return` statements, so the meta-tag lookup was unreachable and
// the `.entry-date` element's `dateTime` was returned instead. Only the meta-tag lookup is kept.
//
// Throws a TypeError if the document has no such <meta> tag (querySelector returns null).
function getChapterDatePublished(rawChapterDoc) {
  return rawChapterDoc.querySelector(`meta[property="article:published_time"]`).content;
}

// Returns the `content` of the document's `article:modified_time` <meta> tag, unchanged.
// Throws a TypeError if the document has no such tag (querySelector returns null).
function getChapterDateModified(rawChapterDoc) {
  const modifiedTimeMeta = rawChapterDoc.querySelector(`meta[property="article:modified_time"]`);
  const { content } = modifiedTimeMeta;
  return content;
}

async function downloadChapter(url) {
Expand Down
40 changes: 29 additions & 11 deletions lib/scaffold.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
"use strict";
const fs = require("fs").promises;
const path = require("path");
const { name: packageName, version: packageVersion } = require("../package.json");

const BOOK_SERIES = "Parahumans";
const BOOK_PUBLISHER = "Domenic Denicola";
const BOOK_AUTHOR = "Wildbow";
const BOOK_GENERATOR = `${packageName} v${packageVersion}`;

const STYLES_FILENAME = "chapter.css";
const COVER_DOCUMENT_FILENAME = "cover.xhtml";
Expand All @@ -17,15 +20,15 @@ module.exports = async (
bookPath,
contentPath,
chaptersPath,
augmentedChapterDataPath,
chapterDataPath,
bookInfo
) => {
await Promise.all([
fs.cp(scaffoldingPath, bookPath, { recursive: true, filter: noThumbs }),
fs.cp(coverImagePath, path.resolve(bookPath, "OEBPS", COVER_IMAGE_FILENAME)),
getChapterInfo(contentPath, chaptersPath, augmentedChapterDataPath).then(info => {
getChapterInfo(contentPath, chaptersPath, chapterDataPath).then(info => {
return Promise.all([
writeOPF(contentPath, bookInfo, info.manifestAndSpineFiles, info.datePublished),
writeOPF(contentPath, bookInfo, info.manifestAndSpineFiles, info.datePublished, info.dateModified),
writeNav(contentPath, info.manifestAndSpineFiles, info.tocHTML),
writeArcTitlePages(chaptersPath, info.arcTitlePages)
]);
Expand All @@ -39,7 +42,7 @@ function noThumbs(filePath) {
return path.basename(filePath) !== "Thumbs.db";
}

function writeOPF(contentPath, bookInfo, manifestAndSpineFiles, datePublished) {
function writeOPF(contentPath, bookInfo, manifestAndSpineFiles, datePublished, dateModified) {
const manifestItems = manifestAndSpineFiles.map(f => {
return ` <item id="${f.id}" href="${f.href}" media-type="application/xhtml+xml"/>`;
}).join("\n");
Expand All @@ -48,8 +51,6 @@ function writeOPF(contentPath, bookInfo, manifestAndSpineFiles, datePublished) {
return ` <itemref idref="${f.id}"/>`;
}).join("\n");

const dateWithoutMilliseconds = `${(new Date()).toISOString().split(".")[0]}Z`;

// Note: per the spec at https://www.w3.org/TR/epub-33/#sec-group-position it seems like the collection-type should be
// "set", but Calibre only recognizes "series" as of now:
// https://github.com/kovidgoyal/calibre/blob/37dd0f5c70ebf8952d7be6dd7c37afd2a4fce9f0/src/calibre/ebooks/metadata/opf3.py#L792
Expand All @@ -65,16 +66,20 @@ function writeOPF(contentPath, bookInfo, manifestAndSpineFiles, datePublished) {
<dc:title id="title">${bookInfo.title}</dc:title>
<meta refines="#title" property="title-type">main</meta>
<meta property="belongs-to-collection" id="collection">Parahumans</meta>
<meta property="belongs-to-collection" id="collection">${BOOK_SERIES}</meta>
<meta refines="#collection" property="collection-type">series</meta>
<meta refines="#collection" property="group-position">${bookInfo.groupPosition}</meta>
<dc:creator id="creator">${BOOK_AUTHOR}</dc:creator>
<meta refines="#creator" property="role" scheme="marc:relators">aut</meta>
<dc:publisher>${BOOK_PUBLISHER}</dc:publisher>
<dc:date>${datePublished}</dc:date>
<meta property="dcterms:modified">${dateWithoutMilliseconds}</meta>
<dc:contributor id="generator">${BOOK_GENERATOR}</dc:contributor>
<meta refines="#generator" property="role" scheme="marc:relators">bkp</meta>
<dc:date>${reformatDateString(datePublished)}</dc:date>
<meta property="dcterms:modified">${reformatDateString(dateModified)}</meta>
<dc:description>${bookInfo.description}</dc:description>
</metadata>
Expand Down Expand Up @@ -133,7 +138,8 @@ async function getChapterInfo(contentPath, chaptersPath, augmentedChapterDataPat
const manifestAndSpineFiles = [];
let tocHTML = " <ol>\n";
let arcIdCounter = 0;
let lastChapter;
let lastChapter, dateModified;
let dateModifiedTimestamp = 0;
for (const arc of chapterData) {
if (!arc.invisible) {
const arcFilename = `arc${arcIdCounter}.xhtml`;
Expand Down Expand Up @@ -164,6 +170,13 @@ async function getChapterInfo(contentPath, chaptersPath, augmentedChapterDataPat
tocHTML += ` <li><a href="${chapterHref}">${chapter.usedTitle}</a></li>\n`;

lastChapter = chapter;

// The modification date of the book is the latest modification date of any chapter.
const thisDateModifiedTimestamp = (new Date(Date.parse(chapter.dateModified))).getTime();
if (thisDateModifiedTimestamp > dateModifiedTimestamp) {
dateModifiedTimestamp = thisDateModifiedTimestamp;
dateModified = chapter.dateModified;
}
}

if (!arc.invisible) {
Expand All @@ -179,7 +192,7 @@ async function getChapterInfo(contentPath, chaptersPath, augmentedChapterDataPat
// We say that the publication date of the book is equal to the publication date of the last chapter.
const { datePublished } = lastChapter;

return { arcTitlePages, manifestAndSpineFiles, tocHTML, datePublished };
return { arcTitlePages, manifestAndSpineFiles, tocHTML, datePublished, dateModified };
}

async function writeArcTitlePages(chaptersPath, arcTitlePages) {
Expand Down Expand Up @@ -207,3 +220,8 @@ async function writeArcTitlePages(chaptersPath, arcTitlePages) {
// Builds the plain-text (no markup) title for an arc, in the form "<label>: <title>".
function arcPlaintextTitle(arc) {
  const { label, title } = arc;
  return `${label}: ${title}`;
}

// Normalizes a parseable date string to UTC in the form "YYYY-MM-DDTHH:mm:ssZ", i.e. the output of
// Date.prototype.toISOString() with the fractional-seconds component removed.
//
// The milliseconds are always dropped, not only when they happen to be ".000". The result is interpolated
// into the OPF <dc:date> and dcterms:modified metadata, and a timestamp carrying fractional seconds is not
// in the seconds-precision form expected there.
//
// Throws a RangeError if dateString cannot be parsed as a date.
function reformatDateString(dateString) {
  const date = new Date(dateString);
  if (Number.isNaN(date.getTime())) {
    throw new RangeError(`Cannot reformat invalid date string: ${dateString}`);
  }

  // toISOString() always ends in ".sssZ"; keep everything up to the seconds and re-append the "Z".
  return date.toISOString().replace(/\.\d{3}Z$/, "Z");
}

0 comments on commit 793a02c

Please sign in to comment.