-
-
Notifications
You must be signed in to change notification settings - Fork 104
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add option to convert Jupyter Notebooks to blog posts
- Loading branch information
Showing
10 changed files
with
2,609 additions
and
200 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,4 +6,10 @@ dist/ | |
site/ | ||
.idea/ | ||
.tox/ | ||
.cache/ | ||
.cache/ | ||
|
||
# Test data - temp files | ||
.ipynb_checkpoints | ||
|
||
# Trial runs | ||
output/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
import glob | ||
import json | ||
import os | ||
import re | ||
from datetime import datetime | ||
from pathlib import Path | ||
|
||
import nbconvert as nbc | ||
import nbformat as nbf | ||
import yaml | ||
from traitlets.config import Config | ||
|
||
from academic.jupyter_whitespace_remover import JupyterWhitespaceRemover | ||
|
||
|
||
def _get_slug(text: str): | ||
return text.lower().replace(" ", "-") | ||
|
||
|
||
def import_notebook(
    input_path,
    output_dir=os.path.join("content", "post"),
    overwrite=False,
    dry_run=False,
):
    """
    Import blog posts from Jupyter Notebook files.

    Each `.ipynb` file matched by `input_path` is read, converted to Markdown
    (with `JupyterWhitespaceRemover` applied as a preprocessor) and handed to
    `_export` to be written under `output_dir`.

    :param input_path: glob pattern of the notebooks to import (matched recursively)
    :param output_dir: folder passed to `_export` as the parent of the generated pages
    :param overwrite: passed to `_export`; whether an existing page may be replaced
    :param dry_run: when true, notebooks are read but `_export` is not called
    """
    from academic.cli import log

    for filename in glob.glob(input_path, recursive=True):
        # Skip anything that is not a notebook, as well as Jupyter's autosave copies
        # stored inside `.ipynb_checkpoints` folders. Use `continue` rather than `return`
        # so that one non-matching path does not abort the remaining imports.
        if not filename.endswith(".ipynb") or ".ipynb_checkpoints" in Path(filename).parts:
            continue

        log.info(f"Found notebook {filename}")

        # Read Notebook (notebooks are UTF-8 encoded JSON; close the file once parsed)
        with open(filename, "r", encoding="utf-8") as notebook_file:
            nb = nbf.read(notebook_file, as_version=4)

        # Export Markdown
        nbc_config = Config()
        nbc_config.MarkdownExporter.preprocessors = [JupyterWhitespaceRemover]
        exporter = nbc.MarkdownExporter(config=nbc_config)
        if not dry_run:
            _export(nb, exporter, output_dir, filename, ".md", overwrite)
|
||
|
||
def _export(nb, exporter, output_dir, filename, extension, overwrite):
    """
    Convert a notebook with `exporter` and write it as a page bundle.

    The bundle is the folder `<output_dir>/<slug>`, where the slug is derived from the
    notebook's file name. It receives `index<extension>` (YAML front matter followed by
    the converted body) plus every output resource returned by the exporter.

    :param nb: notebook node to convert
    :param exporter: exporter whose `from_notebook_node` returns `(body, resources)`
    :param output_dir: parent folder of the page bundle
    :param filename: path of the source notebook; its stem provides the slug and fallback title
    :param extension: extension of the generated index file, including the dot
    :param overwrite: when false, an existing page bundle is left untouched
    """
    from academic.cli import log

    # Determine output path for page bundle
    filename_base = Path(filename).stem
    slug = _get_slug(filename_base)
    page_bundle_path = Path(output_dir) / slug

    # Do not overwrite blog post if it already exists
    if not overwrite and os.path.isdir(page_bundle_path):
        log.info(f"Skipping creation of {page_bundle_path} as it already exists. " f"To overwrite, add the `--overwrite` argument.")
        return

    # Create page bundle folder (no separate existence check, which would be racy)
    os.makedirs(page_bundle_path, exist_ok=True)

    # Check for front matter variables in notebook metadata
    if "front_matter" in nb["metadata"]:
        front_matter_from_file = dict(nb["metadata"]["front_matter"])
        log.info(f"Found front matter metadata in notebook: {json.dumps(front_matter_from_file)}")
    else:
        front_matter_from_file = {}

    # Convert notebook to markdown
    (body, resources) = exporter.from_notebook_node(nb)

    # Export notebook resources
    for name, data in resources.get("outputs", {}).items():
        with open(page_bundle_path / name, "wb") as image_file:
            image_file.write(data)

    # Try to find title as top-level heading (h1) at the very start of the body,
    # falling back to filename. The `(?!#)` guard stops `##` (and deeper) headings
    # from being mistaken for the title.
    heading = re.match(r"#(?!#)(.*)", body)
    if heading:
        title = heading.group(1).strip()
        # Drop the heading line from the body as the title now lives in the front matter
        body = body[heading.end():]
    else:
        title = filename_base.replace("-", " ").title()

    # Initialise front matter variables; values from the notebook metadata take precedence
    date = datetime.now().strftime("%Y-%m-%d")
    front_matter = {"title": title, "date": date}
    front_matter.update(front_matter_from_file)
    log.info(f"Generating page with title: {front_matter['title']}")

    # Unlike the Bibtex converter, we can't easily use Ruamel YAML library here as we need to output to string
    front_matter_yaml = yaml.safe_dump(front_matter, sort_keys=False, allow_unicode=True)
    # Strip final newline as our `output` will auto-add newlines below
    front_matter_yaml = front_matter_yaml.rstrip()
    # Wrap front matter variables with triple hyphens to represent Markdown front matter
    output = "\n".join(("---", front_matter_yaml, "---", clean_markdown(body)))

    # Write output file. The YAML is dumped with `allow_unicode=True`, so write UTF-8
    # explicitly instead of relying on the platform's default encoding.
    output_filename = os.path.join(page_bundle_path, "index" + extension)
    with open(output_filename, "w", encoding="utf-8") as text_file:
        text_file.write(output)
|
||
|
||
def clean_markdown(body: str) -> str:
    """
    Tidy up the Markdown produced by `nbconvert`, which contains excess blank lines.

    Every run of three or more consecutive newlines is reduced to two (a single
    blank line); shorter runs are returned unchanged.
    """
    # Matches all but the last newline of a run, so the run collapses to "\n" + that last newline
    surplus_newlines = re.compile(r"\n+(?=\n)")
    return surplus_newlines.sub("\n", body)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
from nbconvert.preprocessors import Preprocessor | ||
|
||
|
||
class JupyterWhitespaceRemover(Preprocessor):
    """
    Try to clean up a Jupyter notebook by:
    - removing blank code cells
    - removing unnecessary whitespace
    """

    def preprocess(self, nb, resources):
        """
        Remove blank `code` cells and run `preprocess_cell` on every remaining cell.

        A code cell counts as blank when its source is empty or whitespace only.
        Returns the modified notebook together with the resources.
        """
        # Build the filtered list separately: popping from `nb.cells` while iterating
        # over it would skip the cell that follows each removed one.
        kept_cells = []
        for cell in nb.cells:
            if cell.cell_type == "code" and not cell.source.strip():
                continue
            # The index passed on is the cell's position in the filtered notebook
            cell, resources = self.preprocess_cell(cell, resources, len(kept_cells))
            kept_cells.append(cell)

        # Replace the contents in place so the notebook keeps the same list object
        nb.cells[:] = kept_cells
        return nb, resources

    def preprocess_cell(self, cell, resources, cell_index):
        """
        Remove extraneous whitespace from code cells' source code.

        Only leading and trailing whitespace of the whole source is stripped;
        cells of any other type are returned unchanged.
        """
        if cell.cell_type == "code":
            cell.source = cell.source.strip()

        return cell, resources
Oops, something went wrong.