From 7525745a732324c385152081b30ce47ed4671073 Mon Sep 17 00:00:00 2001 From: dyma solovei <53943884+bevzzz@users.noreply.github.com> Date: Mon, 26 Feb 2024 18:13:31 +0100 Subject: [PATCH] Support earlier notebook versions (#13) * feat: decode v4.* notebooks Decoder for v4.4 is reused for all notebooks with major version 4, because their differences do not affect how the notebooks are rendered: - v4.5 requires that each cell has a unique ID - versions < v4.3 do not have the 'code_cell.metadata.execution' field, which holds the code's execution time. They also do not have 'raw_cell.metadata.jupyter.source_hidden' which controls if the source is hidden. This has a default behaviour in 'nb' and is probably not that important anyway. Finally, they lack the 'metadata.title' field, which is currently not used in v4.4 notebooks either. * feat: decode v3.0 notebooks Prior to v4.0: - top-level 'worksheets' contained multiple worksheets with the actual 'cells' - execute_result was called pyout - error output was called pyerr - code cell 'source' was called 'input'; execution_count was called prompt_number - mime-bundle explicitly defined keys for all mime-types which it supported and had to be decoded differently BREAKING: decode.Decoder interface now includes ExtractCells method to handle the deprecation of top-level 'worksheets' * refactor: extract common schema structs * feat: support v1.0 and v2.0 notebooks Turns out, v1 and v2 only differ in how Jupyter interprets them, not in the schema itself. We can use the same decoder we use for v3. 
* chore: update version.go to reflect current release version * refactor: create 1 decoder instance per package This has no logic implications, but it feels like multiple instances are unnecessary --- decode/decode.go | 18 +- decode/decode_test.go | 434 +++++++++++++++++++++++++++++++++++++- schema/common/notebook.go | 64 +++++- schema/schema.go | 11 + schema/v3/schema.go | 322 ++++++++++++++++++++++++++++ schema/v4/schema.go | 86 +++----- version.go | 3 +- 7 files changed, 874 insertions(+), 64 deletions(-) create mode 100644 schema/v3/schema.go diff --git a/decode/decode.go b/decode/decode.go index 2db3aa9..ce59ecd 100644 --- a/decode/decode.go +++ b/decode/decode.go @@ -44,8 +44,13 @@ func (n *notebook) UnmarshalJSON(data []byte) error { return fmt.Errorf("%s: notebook metadata: %w", ver, err) } - n.cells = make([]schema.Cell, len(n.Notebook.Cells)) - for i, raw := range n.Notebook.Cells { + cells, err := d.ExtractCells(data) + if err != nil { + return fmt.Errorf("%s: extract cells: %w", ver, err) + } + + n.cells = make([]schema.Cell, len(cells)) + for i, raw := range cells { c := cell{meta: meta, decoder: d} if err := json.Unmarshal(raw, &c); err != nil { return fmt.Errorf("%s: %w", ver, err) @@ -78,7 +83,16 @@ func (c *cell) UnmarshalJSON(data []byte) error { // Decoder implementations are version-aware and decode cell contents and metadata // based on the respective JSON schema definition. type Decoder interface { + // ExtractCells accesses the array of notebook cells. + // + // Prior to v4.0 cells were not a part of the top level structure, + // and were contained in "worksheets" instead. + ExtractCells(data []byte) ([]json.RawMessage, error) + + // DecodeMeta decodes version-specific metadata. DecodeMeta(data []byte) (schema.NotebookMetadata, error) + + // DecodeCell decodes raw cell data to a version-specific implementation. 
DecodeCell(v map[string]interface{}, data []byte, meta schema.NotebookMetadata) (schema.Cell, error) } diff --git a/decode/decode_test.go b/decode/decode_test.go index d597988..e7f1f23 100644 --- a/decode/decode_test.go +++ b/decode/decode_test.go @@ -6,6 +6,7 @@ import ( "github.com/bevzzz/nb/schema" "github.com/bevzzz/nb/schema/common" + _ "github.com/bevzzz/nb/schema/v3" _ "github.com/bevzzz/nb/schema/v4" "github.com/bevzzz/nb/decode" @@ -37,6 +38,16 @@ func TestDecodeBytes(t *testing.T) { json string nCells int }{ + { + name: "v4.5", + json: `{ + "nbformat": 4, "nbformat_minor": 5, "metadata": {}, "cells": [ + {"id": "a", "cell_type": "markdown", "metadata": {}, "source": []}, + {"id": "b", "cell_type": "markdown", "metadata": {}, "source": []} + ] + }`, + nCells: 2, + }, { name: "v4.4", json: `{ @@ -47,6 +58,91 @@ func TestDecodeBytes(t *testing.T) { }`, nCells: 2, }, + { + name: "v4.3", + json: `{ + "nbformat": 4, "nbformat_minor": 3, "metadata": {}, "cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + ] + }`, + nCells: 2, + }, + { + name: "v4.2", + json: `{ + "nbformat": 4, "nbformat_minor": 2, "metadata": {}, "cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + ] + }`, + nCells: 2, + }, + { + name: "v4.1", + json: `{ + "nbformat": 4, "nbformat_minor": 1, "metadata": {}, "cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + ] + }`, + nCells: 2, + }, + { + name: "v4.0", + json: `{ + "nbformat": 4, "nbformat_minor": 0, "metadata": {}, "cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + ] + }`, + nCells: 2, + }, + { + name: "v3.0", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "markdown", 
"metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + ]}, + {"cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []} + ]} + ] + }`, + nCells: 3, + }, + { + name: "v2.0", + json: `{ + "nbformat": 2, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + ]}, + {"cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []} + ]} + ] + }`, + nCells: 3, + }, + { + name: "v1.0", + json: `{ + "nbformat": 1, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + ]}, + {"cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []} + ]} + ] + }`, + nCells: 3, + }, } { t.Run(tt.name, func(t *testing.T) { nb, err := decode.Bytes([]byte(tt.json)) @@ -174,6 +270,21 @@ func TestDecodeBytes(t *testing.T) { Data: []byte("base64-encoded-image-data"), }, }, + { + name: "v3.0: no explicit mime-type", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "raw", "source": ["sometimes you just want to rawdog sqweel"]} + ]} + ] + }`, + want: WithAttachments{Cell: Cell{ + Type: schema.Raw, + MimeType: common.PlainText, + Text: []byte("sometimes you just want to rawdog sqweel"), + }}, + }, } { t.Run(tt.name, func(t *testing.T) { nb, err := decode.Bytes([]byte(tt.json)) @@ -209,7 +320,8 @@ func TestDecodeBytes(t *testing.T) { { "cell_type": "code", "execution_count": 5, "source": ["print('Hi, mom!')"], "outputs": [ - {"output_type": "stream"}, {"output_type": "stream"} + {"output_type": "stream", "name": "stdout"}, + {"output_type": "stream", "name": "stderr"} ] } ] @@ -225,6 +337,32 @@ func TestDecodeBytes(t *testing.T) { OutputLen: 2, }, }, + { + name: "v3.0", + json: `{ + "nbformat": 3, "nbformat_minor": 0, 
"metadata": {}, "worksheets": [ + {"cells": [ + { + "cell_type": "code", "language": "javascript", "prompt_number": 5, + "input": ["print('Hi, mom!')"], "outputs": [ + {"output_type": "stream", "stream": "stdout"}, + {"output_type": "stream", "stream": "stderr"} + ] + } + ]} + ] + }`, + want: outcome{ + Cell: Cell{ + Type: schema.Code, + MimeType: "application/x-python", // FIXME: expect language-specific mime-type + Text: []byte("print('Hi, mom!')"), + }, + Language: "javascript", + ExecutionCount: 5, + OutputLen: 2, + }, + }, } { t.Run(tt.name, func(t *testing.T) { nb, err := decode.Bytes([]byte(tt.json)) @@ -271,6 +409,28 @@ func TestDecodeBytes(t *testing.T) { }}, }, }, + { + name: "v3.0: stream output to stdout", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "code", "outputs": [ + { + "output_type": "stream", "stream": "stdout", + "text": ["$> ls\n", ".\n", "..\n", "nb/"] + } + ]} + ]} + ] + }`, + want: []output{ + {Cell: Cell{ + Type: schema.Stream, + MimeType: common.Stdout, + Text: []byte("$> ls\n.\n..\nnb/"), + }}, + }, + }, { name: "v4.4: stream output to stderr", json: `{ @@ -292,6 +452,28 @@ func TestDecodeBytes(t *testing.T) { }}, }, }, + { + name: "v3.0: stream output to stderr", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "code", "outputs": [ + { + "output_type": "stream", "stream": "stderr", + "text": ["KeyError: ", "dict['unknown key']"] + } + ]} + ]} + ] + }`, + want: []output{ + {Cell: Cell{ + Type: schema.Stream, + MimeType: common.Stderr, + Text: []byte("KeyError: dict['unknown key']"), + }}, + }, + }, { name: "v4.4: stream output to unrecognized target", json: `{ @@ -321,13 +503,13 @@ func TestDecodeBytes(t *testing.T) { {"cell_type": "code", "outputs": [ {"output_type": "display_data", "metadata": {}, "data": { - "image/png": "base64-encoded-png-image", + "image/png": "base64-encoded-png-image", "text/plain": "
" } }, {"output_type": "display_data", "metadata": {}, "data": { - "image/jpeg": "base64-encoded-jpeg-image", + "image/jpeg": "base64-encoded-jpeg-image", "text/plain": "
" } }, @@ -357,6 +539,93 @@ func TestDecodeBytes(t *testing.T) { }}, }, }, + { + name: "v3.0: display_data output different recognized formats", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "code", "outputs": [ + {"output_type": "display_data", "metadata": {}, + "png": ["base64-encoded-png-image"], + "text": ["
"] + }, + {"output_type": "display_data", "metadata": {}, + "jpeg": ["base64-encoded-jpeg-image"], + "text": ["
"] + }, + {"output_type": "display_data", "metadata": {}, + "html": [""] + }, + {"output_type": "display_data", "metadata": {}, + "svg": [""] + }, + {"output_type": "display_data", "metadata": {}, + "javascript": ["[,,,].length"] + }, + {"output_type": "display_data", "metadata": {}, + "json": ["{\"foo\": \"bar\"}"] + }, + {"output_type": "display_data", "metadata": {}, + "pdf": ["some-raw-pdf-data"] + }, + {"output_type": "display_data", "metadata": {}, + "latex": ["c = \\sqrt{a^2 + b^2}"] + }, + {"output_type": "display_data", "metadata": {}, + "text": [""] + } + ]} + ]} + ] + }`, + want: []output{ + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "image/png", + Text: []byte("base64-encoded-png-image"), + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "image/jpeg", + Text: []byte("base64-encoded-jpeg-image"), + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "text/html", + Text: []byte(``), + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "image/svg+xml", + Text: []byte(``), + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "text/javascript", + Text: []byte("[,,,].length"), + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "application/json", + Text: []byte("{\"foo\": \"bar\"}"), // ???? + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "application/pdf", + Text: []byte("some-raw-pdf-data"), // ???? + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "application/x-latex", + Text: []byte("c = \\sqrt{a^2 + b^2}"), // ???? 
+ }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: common.PlainText, + Text: []byte(""), + }}, + }, + }, { name: "v4.4: execute_result output with several images and a plain text", json: `{ @@ -391,6 +660,101 @@ func TestDecodeBytes(t *testing.T) { }}, }, }, + { + name: "v3.0: pyout (execute_result) output different recognized formats", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "code", "outputs": [ + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "png": ["base64-encoded-png-image"], + "text": ["
"] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "jpeg": ["base64-encoded-jpeg-image"], + "text": ["
"] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "html": [""] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "svg": [""] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "javascript": ["[,,,].length"] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "json": ["{\"foo\": \"bar\"}"] + }, + {"output_type": "pyout", "metadata": {}, + "pdf": ["some-raw-pdf-data"] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "latex": ["c = \\sqrt{a^2 + b^2}"] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "text": [""] + } + ]} + ]} + ] + }`, + want: []output{ + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "image/png", + Text: []byte("base64-encoded-png-image"), + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "image/jpeg", + Text: []byte("base64-encoded-jpeg-image"), + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "text/html", + Text: []byte(``), + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "image/svg+xml", + Text: []byte(``), + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "text/javascript", + Text: []byte("[,,,].length"), + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "application/json", + Text: []byte("{\"foo\": \"bar\"}"), // ???? + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "application/pdf", + Text: []byte("some-raw-pdf-data"), // ???? + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "application/x-latex", + Text: []byte("c = \\sqrt{a^2 + b^2}"), // ???? 
+ }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: common.PlainText, + Text: []byte(""), + }}, + }, + }, { name: "v4.4: error output", json: `{ @@ -417,6 +781,33 @@ func TestDecodeBytes(t *testing.T) { }}, }, }, + { + name: "v3.0: error output", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "code", "outputs": [ + { + "output_type": "pyerr", "ename": "ZeroDivisionError", "evalue": "division by zero", + "traceback": [ + "Traceback (most recent call last):", + "\tFile \"main.py\", line 3, in ", + "\t\tprint(n/0)", + "\tZeroDivisionError: division by zero" + ] + } + ]} + ]} + ] + }`, + want: []output{ + {Cell: Cell{ + Type: schema.Error, + MimeType: common.Stderr, + Text: []byte("Traceback (most recent call last):\n\tFile \"main.py\", line 3, in \n\t\tprint(n/0)\n\tZeroDivisionError: division by zero"), + }}, + }, + }, } { t.Run(tt.name, func(t *testing.T) { nb, err := decode.Bytes([]byte(tt.json)) @@ -433,6 +824,43 @@ func TestDecodeBytes(t *testing.T) { }) } }) + + t.Run("heading cells", func(t *testing.T) { + for _, tt := range []struct { + name string + json string + want Cell + }{ + { + name: "v3.0 used to have dedicated heading cells", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + { + "cell_type": "heading", "level": 2, + "source": ["Fun facts about Ronald McDonald"], "metadata": {} + } + ]} + ] + }`, + want: Cell{ + Type: schema.Markdown, + MimeType: common.MarkdownText, + Text: []byte("## Fun facts about Ronald McDonald"), + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + nb, err := decode.Bytes([]byte(tt.json)) + require.NoError(t, err) + + got := nb.Cells() + require.Len(t, got, 1, "expected 1 cell") + + checkCell(t, got[0], tt.want) + }) + } + }) } // checkCell compares the cell's type and content to expected. 
diff --git a/schema/common/notebook.go b/schema/common/notebook.go index 8246fe7..ae289ab 100644 --- a/schema/common/notebook.go +++ b/schema/common/notebook.go @@ -7,10 +7,9 @@ import ( ) type Notebook struct { - VersionMajor int `json:"nbformat"` - VersionMinor int `json:"nbformat_minor"` - Metadata json.RawMessage `json:"metadata"` // TODO: omitempty - Cells []json.RawMessage `json:"cells"` + VersionMajor int `json:"nbformat"` + VersionMinor int `json:"nbformat_minor"` + Metadata json.RawMessage `json:"metadata"` // TODO: omitempty } func (n *Notebook) Version() schema.Version { @@ -26,3 +25,60 @@ const ( Stdout = "application/vnd.jupyter.stdout" // Custom mime-type for stream output to stdout. Stderr = "application/vnd.jupyter.stderr" // Custom mime-type for stream output to stderr. ) + +// Markdown defines the schema for a "markdown" cell. +type Markdown struct { + Source MultilineString `json:"source"` +} + +var _ schema.Cell = (*Markdown)(nil) + +func (md *Markdown) Type() schema.CellType { + return schema.Markdown +} + +func (md *Markdown) MimeType() string { + return MarkdownText +} + +func (md *Markdown) Text() []byte { + return md.Source.Text() +} + +// Raw defines the schema for a "raw" cell. +type Raw struct { + Source MultilineString `json:"source"` + Metadata RawCellMetadata `json:"metadata"` +} + +var _ schema.Cell = (*Raw)(nil) + +func (raw *Raw) Type() schema.CellType { + return schema.Raw +} + +func (raw *Raw) MimeType() string { + return raw.Metadata.MimeType() +} + +func (raw *Raw) Text() []byte { + return raw.Source.Text() +} + +// RawCellMetadata may specify a target conversion format. +type RawCellMetadata struct { + Format *string `json:"format"` + RawMimeType *string `json:"raw_mimetype"` +} + +// MimeType returns a more specific mime-type if one is provided and "text/plain" otherwise. 
+func (raw *RawCellMetadata) MimeType() string { + switch { + case raw.Format != nil: + return *raw.Format + case raw.RawMimeType != nil: + return *raw.RawMimeType + default: + return PlainText + } +} diff --git a/schema/schema.go b/schema/schema.go index e6df60d..d510239 100644 --- a/schema/schema.go +++ b/schema/schema.go @@ -1,3 +1,11 @@ +// Package schema defines the common data format for elements of a Jupyter notebook. +// +// It is based on the [v4.4] definition, as it is stable and encompasses all the data +// necessary for accurate rendering. Note, that schema validation is not a goal of this +// package, and so, interfaces defined here will often omit the non-essential data, +// e.g. metadata or fields specific to JupyterLab environment. +// +// [v4.4]: https://github.com/jupyter/nbformat/blob/main/nbformat/v4/nbformat.v4.4.schema.json package schema import ( @@ -34,6 +42,9 @@ type Cell interface { Text() []byte } +// HasAttachments is implemented by cells which include [cell attachments]. +// +// [cell attachments]: https://nbformat.readthedocs.io/en/latest/format_description.html#cell-attachments type HasAttachments interface { // Attachments are only defined for v4.0 and above for markdown and raw cells // and may be omitted in the JSON. Cells without attachments should return nil. diff --git a/schema/v3/schema.go b/schema/v3/schema.go new file mode 100644 index 0000000..c50fb45 --- /dev/null +++ b/schema/v3/schema.go @@ -0,0 +1,322 @@ +// Package v3 provides a decoder for Jupyter Notebooks v1.0, v2.0, and v3.0. +// +// It implements the IPython Notebook v3.0 JSON Schema, which is also suitable +// for decoding all earlier versions, as there hasn't been any breaking changes +// to it. 
+// +// [IPython Notebook v3.0 JSON Schema]: https://github.com/jupyter/nbformat/blob/main/nbformat/v3/nbformat.v3.schema.json +package v3 + +import ( + "bytes" + "encoding/json" + "fmt" + "strings" + + "github.com/bevzzz/nb/decode" + "github.com/bevzzz/nb/schema" + "github.com/bevzzz/nb/schema/common" +) + +func init() { + d := new(decoder) + decode.RegisterDecoder(schema.Version{Major: 3, Minor: 0}, d) + decode.RegisterDecoder(schema.Version{Major: 2, Minor: 0}, d) + decode.RegisterDecoder(schema.Version{Major: 1, Minor: 0}, d) +} + +// decoder decodes cell contents and metadata for nbformat v3.0, v2.0, and v1.0. +type decoder struct{} + +var _ decode.Decoder = (*decoder)(nil) + +func (d *decoder) ExtractCells(data []byte) ([]json.RawMessage, error) { + var raw struct { + Worksheets []struct { + Cells []json.RawMessage `json:"cells"` + } `json:"worksheets"` + } + if err := json.Unmarshal(data, &raw); err != nil { + return nil, err + } + + var cells []json.RawMessage + for i := range raw.Worksheets { + cells = append(cells, raw.Worksheets[i].Cells...) + } + return cells, nil +} + +func (d *decoder) DecodeMeta(data []byte) (schema.NotebookMetadata, error) { + return nil, nil +} + +func (d *decoder) DecodeCell(m map[string]interface{}, data []byte, meta schema.NotebookMetadata) (schema.Cell, error) { + var ct interface{} + var c schema.Cell + switch ct = m["cell_type"]; ct { + case "markdown": + c = &Markdown{} + case "heading": + c = &Heading{} + case "raw": + c = &Raw{} + case "code": + c = &Code{} + default: + return nil, fmt.Errorf("unknown cell type %q", ct) + } + if err := json.Unmarshal(data, &c); err != nil { + return nil, fmt.Errorf("%s: %w", ct, err) + } + return c, nil +} + +type ( + Markdown = common.Markdown + Raw = common.Raw +) + +// Heading is a dedicated cell type which represent a heading in a Jupyter notebook. +// This type is deprecated in the later versions and the content is stored as markdown instead. 
+// +// Heading cell behaves exactly like a markdown cell, decorating its source with the +// appropriate number of heading signs (#). +type Heading struct { + Markdown + Level int `json:"level"` +} + +var _ schema.Cell = (*Heading)(nil) + +func (h *Heading) Text() []byte { + hashes := append(bytes.Repeat([]byte("#"), h.Level), " "...) + return append(hashes, h.Source.Text()...) +} + +// Code defines the schema for a "code" cell. +type Code struct { + Source common.MultilineString `json:"input"` + TimesExecuted int `json:"prompt_number"` + Out []Output `json:"outputs"` + Lang string `json:"language"` +} + +var _ schema.CodeCell = (*Code)(nil) +var _ schema.Outputter = (*Code)(nil) + +func (code *Code) Type() schema.CellType { + return schema.Code +} + +// FIXME: return correct mime type (add a function to common) +func (code *Code) MimeType() string { + return "application/x-python" +} + +func (code *Code) Text() []byte { + return code.Source.Text() +} + +func (code *Code) Language() string { + return code.Lang +} + +func (code *Code) ExecutionCount() int { + return code.TimesExecuted +} + +func (code *Code) Outputs() (cells []schema.Cell) { + for i := range code.Out { + cells = append(cells, code.Out[i].cell) + } + return +} + +// Outputs unmarshals cell outputs into schema.Cell based on their type. 
+type Output struct { + cell schema.Cell +} + +func (out *Output) UnmarshalJSON(data []byte) error { + var v map[string]interface{} + if err := json.Unmarshal(data, &v); err != nil { + return fmt.Errorf("code outputs: %w", err) + } + + var t interface{} + var c schema.Cell + switch t = v["output_type"]; t { + case "stream": + c = &StreamOutput{} + case "display_data": + c = &DisplayDataOutput{} + case "pyout": + c = &ExecuteResultOutput{} + case "pyerr": + c = &ErrorOutput{} + default: + return fmt.Errorf("unknown output type %q", t) + } + + if err := json.Unmarshal(data, &c); err != nil { + return fmt.Errorf("%q output: %w", t, err) + } + out.cell = c + return nil +} + +// StreamOutput is a plain, text-based output of the executed code. +// Depending on the stream "target", Type() can report "text/plain" (stdout) or "error" (stderr). +// The output is often decorated with ANSI-color sequences, which should be handled separately. +type StreamOutput struct { + // Target can be stdout or stderr. + Target string `json:"stream"` + Source common.MultilineString `json:"text"` +} + +var _ schema.Cell = (*StreamOutput)(nil) + +func (stream *StreamOutput) Type() schema.CellType { + return schema.Stream +} + +func (stream *StreamOutput) MimeType() string { + switch stream.Target { + case "stdout": + return common.Stdout + case "stderr": + return common.Stderr + } + return common.PlainText +} + +func (stream *StreamOutput) Text() []byte { + return stream.Source.Text() +} + +// DisplayDataOutput are rich-format outputs generated by running the code in the parent cell. +type DisplayDataOutput struct { + MimeBundle + Metadata map[string]interface{} `json:"metadata"` +} + +var _ schema.Cell = (*DisplayDataOutput)(nil) + +func (dd *DisplayDataOutput) Type() schema.CellType { + return schema.DisplayData +} + +// MimeBundle contains rich output data keyed by mime-type. 
+type MimeBundle struct { + PNG common.MultilineString `json:"png,omitempty"` + JPEG common.MultilineString `json:"jpeg,omitempty"` + HTML common.MultilineString `json:"html,omitempty"` + SVG common.MultilineString `json:"svg,omitempty"` + Javascript common.MultilineString `json:"javascript,omitempty"` + JSON common.MultilineString `json:"json,omitempty"` + PDF common.MultilineString `json:"pdf,omitempty"` + LaTeX common.MultilineString `json:"latex,omitempty"` + Txt common.MultilineString `json:"text,omitempty"` +} + +var _ schema.MimeBundle = (*MimeBundle)(nil) + +// MimeType returns the richer of the mime-types present in the bundle, +// and falls back to "text/plain" otherwise. +func (mb MimeBundle) MimeType() string { + switch { + case mb.PNG != nil: + return "image/png" + case mb.JPEG != nil: + return "image/jpeg" + case mb.HTML != nil: + return "text/html" + case mb.SVG != nil: + return "image/svg+xml" + case mb.Javascript != nil: + return "text/javascript" + case mb.JSON != nil: + return "application/json" + case mb.PDF != nil: + return "application/pdf" + case mb.LaTeX != nil: + return "application/x-latex" + } + return common.PlainText +} + +// Text returns data with the richer mime-type. +func (mb MimeBundle) Text() []byte { + return mb.Data(mb.MimeType()) +} + +// Data returns mime-type-specific content if present and a nil slice otherwise. +func (mb MimeBundle) Data(mime string) []byte { + switch mime { + case "image/png": + return mb.PNG.Text() + case "image/jpeg": + return mb.JPEG.Text() + case "text/html": + return mb.HTML.Text() + case "image/svg+xml": + return mb.SVG.Text() + case "text/javascript": + return mb.Javascript.Text() + case "application/json": + return mb.JSON.Text() + case "application/pdf": + return mb.PDF.Text() + case "application/x-latex": + return mb.LaTeX.Text() + case common.PlainText: + return mb.Txt.Text() + } + return nil +} + +// PlainText returns data for "text/plain" mime-type and a nil slice otherwise. 
+func (mb MimeBundle) PlainText() []byte { + return mb.Data(common.PlainText) +} + +// ExecuteResultOutput is the result of executing the code in the cell. +// Its contents are identical to those of DisplayDataOutput with the addition of the execution count. +type ExecuteResultOutput struct { + DisplayDataOutput + TimesExecuted int `json:"prompt_number"` +} + +var _ schema.Cell = (*ExecuteResultOutput)(nil) +var _ schema.ExecutionCounter = (*ExecuteResultOutput)(nil) + +func (ex *ExecuteResultOutput) Type() schema.CellType { + return schema.ExecuteResult +} + +func (ex *ExecuteResultOutput) ExecutionCount() int { + return ex.TimesExecuted +} + +// ErrorOutput stores the output of a failed code execution. +type ErrorOutput struct { + ExceptionName string `json:"ename"` + ExceptionValue string `json:"evalue"` + Traceback []string `json:"traceback"` +} + +var _ schema.Cell = (*ErrorOutput)(nil) + +func (err *ErrorOutput) Type() schema.CellType { + return schema.Error +} + +func (err *ErrorOutput) MimeType() string { + return common.Stderr +} + +func (err *ErrorOutput) Text() (txt []byte) { + s := strings.Join(err.Traceback, "\n") + return []byte(s) +} diff --git a/schema/v4/schema.go b/schema/v4/schema.go index 940f122..c13a7c2 100644 --- a/schema/v4/schema.go +++ b/schema/v4/schema.go @@ -1,3 +1,9 @@ +// Package v4 provides a decoder for Jupyter Notebooks v4.0 and later minor versions. +// +// It implements the IPython Notebook v4.0 JSON Schema. Other minor versions can be decoded using the same, +// as the differences do not affect how the notebook is rendered. 
+// +// [IPython Notebook v4.0 JSON Schema]: https://github.com/jupyter/nbformat/blob/main/nbformat/v4/nbformat.v4.0.schema.json package v4 import ( @@ -11,13 +17,30 @@ import ( ) func init() { - decode.RegisterDecoder(version, new(decoder)) + d := new(decoder) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 5}, d) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 4}, d) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 3}, d) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 2}, d) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 1}, d) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 0}, d) } -var version = schema.Version{Major: 4, Minor: 4} - +// decoder decodes cell contents and metadata for nbformat v4.0. type decoder struct{} +var _ decode.Decoder = (*decoder)(nil) + +func (d *decoder) ExtractCells(data []byte) ([]json.RawMessage, error) { + var raw struct { + Cells []json.RawMessage `json:"cells"` + } + if err := json.Unmarshal(data, &raw); err != nil { + return nil, err + } + return raw.Cells, nil +} + func (d *decoder) DecodeMeta(data []byte) (schema.NotebookMetadata, error) { var nm NotebookMetadata if err := json.Unmarshal(data, &nm); err != nil { @@ -60,51 +83,24 @@ func (nm *NotebookMetadata) Language() string { // Markdown defines the schema for a "markdown" cell. type Markdown struct { - Att Attachments `json:"attachments,omitempty"` - Source common.MultilineString `json:"source"` + common.Markdown + Att Attachments `json:"attachments,omitempty"` } -var _ schema.Cell = (*Markdown)(nil) var _ schema.HasAttachments = (*Markdown)(nil) -func (md *Markdown) Type() schema.CellType { - return schema.Markdown -} - -func (md *Markdown) MimeType() string { - return common.MarkdownText -} - -func (md *Markdown) Text() []byte { - return md.Source.Text() -} - func (md *Markdown) Attachments() schema.Attachments { return md.Att } // Raw defines the schema for a "raw" cell. 
type Raw struct { - Att Attachments `json:"attachments,omitempty"` - Source common.MultilineString `json:"source"` - Metadata RawCellMetadata `json:"metadata"` + common.Raw + Att Attachments `json:"attachments,omitempty"` } -var _ schema.Cell = (*Raw)(nil) var _ schema.HasAttachments = (*Raw)(nil) -func (raw *Raw) Type() schema.CellType { - return schema.Raw -} - -func (raw *Raw) MimeType() string { - return raw.Metadata.MimeType() -} - -func (raw *Raw) Text() []byte { - return raw.Source.Text() -} - func (raw *Raw) Attachments() schema.Attachments { return raw.Att } @@ -122,24 +118,6 @@ func (att Attachments) MimeBundle(filename string) schema.MimeBundle { return mb } -// RawCellMetadata may specify a target conversion format. -type RawCellMetadata struct { - Format *string `json:"format"` - RawMimeType *string `json:"raw_mimetype"` -} - -// MimeType returns a more specific mime-type if one is provided and "text/plain" otherwise. -func (raw *RawCellMetadata) MimeType() string { - switch { - case raw.Format != nil: - return *raw.Format - case raw.RawMimeType != nil: - return *raw.RawMimeType - default: - return common.PlainText - } -} - // Code defines the schema for a "code" cell. type Code struct { Source common.MultilineString `json:"source"` @@ -155,7 +133,7 @@ func (code *Code) Type() schema.CellType { return schema.Code } -// TODO: return correct mime type (add a function to common) +// FIXME: return correct mime type (add a function to common) func (code *Code) MimeType() string { return "application/x-python" } @@ -295,7 +273,7 @@ func (mb MimeBundle) Data(mime string) []byte { return nil } -// RawText returns data for "text/plain" mime-type and a nil slice otherwise. +// PlainText returns data for "text/plain" mime-type and a nil slice otherwise. 
func (mb MimeBundle) PlainText() []byte { return mb.Data(common.PlainText) } diff --git a/version.go b/version.go index 3ac4da1..2a4b750 100644 --- a/version.go +++ b/version.go @@ -2,10 +2,11 @@ package nb import ( // Currently supported nbformat versions: + _ "github.com/bevzzz/nb/schema/v3" _ "github.com/bevzzz/nb/schema/v4" ) // Version returns current release version. func Version() string { - return "v0.2.0" + return "v0.2.1" }