Skip to content

Commit

Permalink
feat: add option to convert Jupyter Notebooks to blog posts
Browse files Browse the repository at this point in the history
  • Loading branch information
gcushen committed Nov 4, 2023
1 parent ff3981e commit 637ab31
Show file tree
Hide file tree
Showing 10 changed files with 2,609 additions and 200 deletions.
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,10 @@ dist/
site/
.idea/
.tox/
.cache/
.cache/

# Test data - temp files
.ipynb_checkpoints

# Trial runs
output/
17 changes: 15 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
[![GitHub followers](https://img.shields.io/github/followers/gcushen?label=Follow%20on%20GH&style=for-the-badge)](https://github.com/gcushen)


### 📚 Easily import publications from your reference manager to your Markdown-formatted website or book
### 📚 Easily import publications and Jupyter notebooks to your Markdown-formatted website or book

![](.github/media/demo.gif)

Expand Down Expand Up @@ -86,6 +86,18 @@ After importing publications, we suggest you:

[Learn more in the Wowchemy Docs](https://university.wowchemy.com).

### Import blog posts from Jupyter Notebooks

Say we have our notebooks in a `notebooks` folder within the website folder. To import them into the `content/post/` folder, run:

academic import 'notebooks/*.ipynb' content/post/ --verbose

Optional arguments:

* `--overwrite` Overwrite any existing blog posts in the output folder
* `--verbose` or `-v` Show verbose messages
* `--help` Help

## Contribute

Interested in contributing to **open source** and **open science**?
Expand All @@ -99,7 +111,8 @@ For local development, clone this repository and use Poetry to install and run t
git clone https://github.com/wowchemy/bibtex-to-markdown.git
cd bibtex-to-markdown
poetry install
poetry run academic import tests/data/article.bib output/ --overwrite --compact
poetry run academic import tests/data/article.bib output/publication/ --overwrite --compact
poetry run academic import 'tests/data/**/*.ipynb' output/post/ --overwrite --verbose

When preparing a contribution, run the following checks and ensure that they all pass:

Expand Down
35 changes: 22 additions & 13 deletions academic/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from argparse import RawTextHelpFormatter

from academic.import_bibtex import import_bibtex
from academic.import_notebook import import_notebook

# Initialise logger.
logging.basicConfig(
Expand Down Expand Up @@ -35,7 +36,7 @@ def parse_args(args):

# Sub-parser for import command.
parser_a = subparsers.add_parser("import", help="Import content into your website or book")
parser_a.add_argument("input", type=str, help="File path to your BibTeX file")
parser_a.add_argument("input", type=str, help="File path to your BibTeX or Jupyter Notebook file")
parser_a.add_argument("output", type=str, help="Path to import publications to (e.g. `content/publication/`)")
parser_a.add_argument("--featured", action="store_true", help="Flag publications as featured")
parser_a.add_argument("--overwrite", action="store_true", help="Overwrite existing publications")
Expand Down Expand Up @@ -65,18 +66,26 @@ def parse_args(args):
if known_args.command:
if known_args.verbose:
# Set logging level to debug if verbose mode activated.
logging.getLogger().setLevel(logging.DEBUG)

# Run command to import bibtex.
import_bibtex(
known_args.input,
pub_dir=known_args.output,
featured=known_args.featured,
overwrite=known_args.overwrite,
normalize=known_args.normalize,
compact=known_args.compact,
dry_run=known_args.dry_run,
)
logging.getLogger().setLevel(logging.INFO)
if known_args.input.lower().endswith(".bib"):
# Run command to import bibtex.
import_bibtex(
known_args.input,
pub_dir=known_args.output,
featured=known_args.featured,
overwrite=known_args.overwrite,
normalize=known_args.normalize,
compact=known_args.compact,
dry_run=known_args.dry_run,
)
elif known_args.input.lower().endswith(".ipynb"):
# Run command to import Jupyter Notebooks.
import_notebook(
known_args.input,
output_dir=known_args.output,
overwrite=known_args.overwrite,
dry_run=known_args.dry_run,
)


if __name__ == "__main__":
Expand Down
6 changes: 2 additions & 4 deletions academic/import_bibtex.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,10 @@ def parse_bibtex_entry(

# Prepare YAML front matter for Markdown file.
if not dry_run:
from importlib import resources as impresources
from importlib import resources as import_resources

# Load the Markdown template from within the `templates` folder of the `academic` package
inp_file = impresources.files(__package__ + ".templates") / "publication.md"
with inp_file.open("rt") as f:
template = f.read()
template = import_resources.read_text(__package__ + ".templates", "publication.md")

with open(markdown_path, "w") as f:
f.write(template)
Expand Down
111 changes: 111 additions & 0 deletions academic/import_notebook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import glob
import json
import os
import re
from datetime import datetime
from pathlib import Path

import nbconvert as nbc
import nbformat as nbf
import yaml
from traitlets.config import Config

from academic.jupyter_whitespace_remover import JupyterWhitespaceRemover


def _get_slug(text: str):
    """
    Build a slug from `text`.

    The text is lower-cased and every space character is replaced by a hyphen.
    No other characters are altered.
    """
    lowered = text.lower()
    return "-".join(lowered.split(" "))


def import_notebook(
    input_path,
    output_dir=os.path.join("content", "post"),
    overwrite=False,
    dry_run=False,
):
    """
    Import blog posts from Jupyter Notebook files.

    :param input_path: glob pattern selecting the notebooks to import; it is expanded
        with `glob.glob(..., recursive=True)`, so `**` is supported
    :param output_dir: folder that receives one page bundle per notebook
    :param overwrite: forwarded to `_export`; when false, existing page bundles are kept
    :param dry_run: when true, notebooks are read but nothing is written
    """
    from academic.cli import log

    # The exporter configuration is the same for every notebook, so build it once.
    nbc_config = Config()
    nbc_config.MarkdownExporter.preprocessors = [JupyterWhitespaceRemover]
    exporter = nbc.MarkdownExporter(config=nbc_config)

    for filename in glob.glob(input_path, recursive=True):
        # Skip non-notebooks and Jupyter's autosave copies, which live inside a
        # `.ipynb_checkpoints` folder. Use `continue` (not `return`) so that one
        # unwanted match does not abort the import of the remaining notebooks.
        if not filename.endswith(".ipynb") or ".ipynb_checkpoints" in Path(filename).parts:
            continue

        log.info(f"Found notebook {filename}")

        # Read Notebook (context manager so the file handle is always closed)
        with open(filename, "r", encoding="utf-8") as notebook_file:
            nb = nbf.read(notebook_file, as_version=4)

        # Export Markdown
        if not dry_run:
            _export(nb, exporter, output_dir, filename, ".md", overwrite)


def _export(nb, exporter, output_dir, filename, extension, overwrite):
    """
    Convert one notebook into a page bundle: `<output_dir>/<slug>/index<extension>`.

    :param nb: notebook node to convert
    :param exporter: exporter whose `from_notebook_node` returns `(body, resources)`
    :param output_dir: folder in which the page bundle folder is created
    :param filename: path of the source notebook; its stem provides the slug and the fallback title
    :param extension: file extension (including the dot) of the generated index file
    :param overwrite: when false, an already existing page bundle folder is left untouched
    """
    from academic.cli import log

    # Determine output path for page bundle
    filename_base = Path(filename).stem
    slug = _get_slug(filename_base)
    page_bundle_path = Path(output_dir) / slug

    # Do not overwrite blog post if it already exists
    if not overwrite and os.path.isdir(page_bundle_path):
        log.info(f"Skipping creation of {page_bundle_path} as it already exists. " f"To overwrite, add the `--overwrite` argument.")
        return

    # Create page bundle folder (no separate existence check, which would be racy)
    os.makedirs(page_bundle_path, exist_ok=True)

    # Check for front matter variables in notebook metadata
    if "front_matter" in nb["metadata"]:
        front_matter_from_file = dict(nb["metadata"]["front_matter"])
        log.info(f"Found front matter metadata in notebook: {json.dumps(front_matter_from_file)}")
    else:
        front_matter_from_file = {}

    # Convert notebook to markdown
    (body, resources) = exporter.from_notebook_node(nb)

    # Export notebook resources
    for name, data in resources.get("outputs", {}).items():
        with open(page_bundle_path / name, "wb") as image_file:
            image_file.write(data)

    # Try to find title as top-level heading (h1) at the very start of the body,
    # falling back to filename. The negative lookahead rejects `##` and deeper
    # headings, which would otherwise yield a title beginning with `#`.
    h1_pattern = r"^#(?!#)(.*)"
    search = re.search(h1_pattern, body)
    if search:
        title = search.group(1).strip()
        body = re.sub(h1_pattern, "", body, count=1)
    else:
        title = filename_base.replace("-", " ").title()

    # Initialise front matter variables; values from the notebook metadata take precedence
    date = datetime.now().strftime("%Y-%m-%d")
    front_matter = {"title": title, "date": date}
    front_matter.update(front_matter_from_file)
    log.info(f"Generating page with title: {front_matter['title']}")

    # Unlike the Bibtex converter, we can't easily use Ruamel YAML library here as we need to output to string
    front_matter_yaml = yaml.safe_dump(front_matter, sort_keys=False, allow_unicode=True)
    # Strip final newline as our `output` will auto-add newlines below
    front_matter_yaml = front_matter_yaml.rstrip()
    # Wrap front matter variables with triple hyphens to represent Markdown front matter
    output = "\n".join(("---", front_matter_yaml, "---", clean_markdown(body)))

    # Write output file. The YAML is dumped with `allow_unicode=True`, so write as UTF-8
    # explicitly instead of relying on the platform's default encoding.
    output_filename = os.path.join(page_bundle_path, "index" + extension)
    with open(output_filename, "w", encoding="utf-8") as text_file:
        text_file.write(output)


def clean_markdown(body: str) -> str:
    """
    Tidy up the Markdown produced by `nbconvert`.

    Every run of three or more consecutive newlines is collapsed to exactly two,
    i.e. at most one blank line is kept between blocks of text. Single newlines
    and single blank lines are left as they are.
    """
    excess_newlines = re.compile(r"\n{3,}")
    return excess_newlines.sub("\n\n", body)
29 changes: 29 additions & 0 deletions academic/jupyter_whitespace_remover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from nbconvert.preprocessors import Preprocessor


class JupyterWhitespaceRemover(Preprocessor):
    """
    Try to clean up a Jupyter notebook by:
    - removing blank code cells
    - removing unnecessary whitespace
    """

    def preprocess(self, nb, resources):
        """
        Remove blank `code` cells, then run `preprocess_cell` on every remaining cell.

        A code cell counts as blank when its source is empty or whitespace-only.
        Returns the `(nb, resources)` pair.
        """
        # Build a filtered list rather than popping while iterating: removing an
        # element during `enumerate` shifts the following cell into the current
        # index, so it would be skipped (neither removed nor cleaned).
        nb.cells = [cell for cell in nb.cells if cell.cell_type != "code" or cell.source.strip()]

        for index, cell in enumerate(nb.cells):
            nb.cells[index], resources = self.preprocess_cell(cell, resources, index)
        return nb, resources

    def preprocess_cell(self, cell, resources, cell_index):
        """
        Remove extraneous whitespace from code cells' source code.

        Only leading and trailing whitespace is stripped, and only for `code` cells;
        other cell types are returned unchanged.
        """
        if cell.cell_type == "code":
            cell.source = cell.source.strip()

        return cell, resources
Loading

0 comments on commit 637ab31

Please sign in to comment.