From 96767ed409957a233462b937395947d951b5c5ea Mon Sep 17 00:00:00 2001 From: dyma solovei Date: Fri, 23 Feb 2024 18:01:21 +0100 Subject: [PATCH 1/6] feat: decode v4.* notebooks Decoder for v4.4 is reused for all notebooks with major version 4, because their differences do not affect how the notebooks are rendered: - v4.5 requires that each cell has a unique ID - versions < v4.3 do not have the 'code_cell.metadata.execution' field, which holds the code's execution time. They also do not have 'raw_cell.metadata.jupyter.source_hidden' which controls whether the source is hidden. This has a default behaviour in 'nb' and is probably not that important anyway. Finally, they lack the metadata.title field, which is currently not used in v4.4 notebooks either. --- decode/decode_test.go | 50 +++++++++++++++++++++++++++++++++++++++++++ schema/schema.go | 11 ++++++++++ schema/v4/schema.go | 14 +++++++++--- 3 files changed, 72 insertions(+), 3 deletions(-) diff --git a/decode/decode_test.go b/decode/decode_test.go index d597988..2d55660 100644 --- a/decode/decode_test.go +++ b/decode/decode_test.go @@ -37,6 +37,16 @@ func TestDecodeBytes(t *testing.T) { json string nCells int }{ + { + name: "v4.5", + json: `{ + "nbformat": 4, "nbformat_minor": 5, "metadata": {}, "cells": [ + {"id": "a", "cell_type": "markdown", "metadata": {}, "source": []}, + {"id": "b", "cell_type": "markdown", "metadata": {}, "source": []} + ] + }`, + nCells: 2, + }, { name: "v4.4", json: `{ @@ -47,6 +57,46 @@ func TestDecodeBytes(t *testing.T) { }`, nCells: 2, }, + { + name: "v4.3", + json: `{ + "nbformat": 4, "nbformat_minor": 3, "metadata": {}, "cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + ] + }`, + nCells: 2, + }, + { + name: "v4.2", + json: `{ + "nbformat": 4, "nbformat_minor": 2, "metadata": {}, "cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + 
] + }`, + nCells: 2, + }, + { + name: "v4.1", + json: `{ + "nbformat": 4, "nbformat_minor": 1, "metadata": {}, "cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + ] + }`, + nCells: 2, + }, + { + name: "v4.0", + json: `{ + "nbformat": 4, "nbformat_minor": 0, "metadata": {}, "cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + ] + }`, + nCells: 2, + }, } { t.Run(tt.name, func(t *testing.T) { nb, err := decode.Bytes([]byte(tt.json)) diff --git a/schema/schema.go b/schema/schema.go index e6df60d..d510239 100644 --- a/schema/schema.go +++ b/schema/schema.go @@ -1,3 +1,11 @@ +// Package schema defines the common data format for elements of a Jupyter notebook. +// +// It is based on the [v4.4] definition, as it is stable and encompasses all the data +// necessary for accurate rendering. Note, that schema validation is not a goal of this +// package, and so, interfaces defined here will often omit the non-essential data, +// e.g. metadata or fields specific to JupyterLab environment. +// +// [v4.4]: https://github.com/jupyter/nbformat/blob/main/nbformat/v4/nbformat.v4.4.schema.json package schema import ( @@ -34,6 +42,9 @@ type Cell interface { Text() []byte } +// HasAttachments is implemented by cells which include [cell attachments]. +// +// [cell attachments]: https://nbformat.readthedocs.io/en/latest/format_description.html#cell-attachments type HasAttachments interface { // Attachments are only defined for v4.0 and above for markdown and raw cells // and may be omitted in the JSON. Cells without attachments should return nil. 
diff --git a/schema/v4/schema.go b/schema/v4/schema.go index 940f122..630613a 100644 --- a/schema/v4/schema.go +++ b/schema/v4/schema.go @@ -11,13 +11,21 @@ import ( ) func init() { - decode.RegisterDecoder(version, new(decoder)) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 5}, new(decoder)) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 4}, new(decoder)) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 3}, new(decoder)) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 2}, new(decoder)) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 1}, new(decoder)) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 0}, new(decoder)) } -var version = schema.Version{Major: 4, Minor: 4} - +// decoder decodes cell contents and metadata for nbformat v4.4. +// Other versions can be decoded using the same, as their schema +// differs in ways that does not affect how the notebook is rendered. type decoder struct{} +var _ decode.Decoder = (*decoder)(nil) + func (d *decoder) DecodeMeta(data []byte) (schema.NotebookMetadata, error) { var nm NotebookMetadata if err := json.Unmarshal(data, &nm); err != nil { From 230f0ff0080e5bf60ba77873ec2e973ab246756b Mon Sep 17 00:00:00 2001 From: dyma solovei Date: Sat, 24 Feb 2024 17:25:35 +0100 Subject: [PATCH 2/6] feat: decode v3.0 notebooks Prior to v4.0: - top-level 'worksheets' contained multiple worksheets with the actual 'cells' - execute_result output was called pyout - error output was called pyerr - code cell 'source' was called 'input'; execution_count was called prompt_number - mime-bundle explicitly defined keys for all mime-types which it supported and had to be decoded differently BREAKING: decode.Decoder interface now includes ExtractCells method to handle the deprecation of top-level 'worksheets' --- decode/decode.go | 18 +- decode/decode_test.go | 356 +++++++++++++++++++++++++++++++++++- schema/common/notebook.go | 1 - schema/v3/schema.go | 372 ++++++++++++++++++++++++++++++++++++++ 
schema/v4/schema.go | 14 +- version.go | 1 + 6 files changed, 753 insertions(+), 9 deletions(-) create mode 100644 schema/v3/schema.go diff --git a/decode/decode.go b/decode/decode.go index 2db3aa9..ce59ecd 100644 --- a/decode/decode.go +++ b/decode/decode.go @@ -44,8 +44,13 @@ func (n *notebook) UnmarshalJSON(data []byte) error { return fmt.Errorf("%s: notebook metadata: %w", ver, err) } - n.cells = make([]schema.Cell, len(n.Notebook.Cells)) - for i, raw := range n.Notebook.Cells { + cells, err := d.ExtractCells(data) + if err != nil { + return fmt.Errorf("%s: extract cells: %w", ver, err) + } + + n.cells = make([]schema.Cell, len(cells)) + for i, raw := range cells { c := cell{meta: meta, decoder: d} if err := json.Unmarshal(raw, &c); err != nil { return fmt.Errorf("%s: %w", ver, err) @@ -78,7 +83,16 @@ func (c *cell) UnmarshalJSON(data []byte) error { // Decoder implementations are version-aware and decode cell contents and metadata // based on the respective JSON schema definition. type Decoder interface { + // ExtractCells accesses the array of notebook cells. + // + // Prior to v4.0 cells were not a part of the top level structure, + // and were contained in "worksheets" instead. + ExtractCells(data []byte) ([]json.RawMessage, error) + + // DecodeMeta decodes version-specific metadata. DecodeMeta(data []byte) (schema.NotebookMetadata, error) + + // DecodeCell decodes raw cell data to a version-specific implementation. 
DecodeCell(v map[string]interface{}, data []byte, meta schema.NotebookMetadata) (schema.Cell, error) } diff --git a/decode/decode_test.go b/decode/decode_test.go index 2d55660..7279145 100644 --- a/decode/decode_test.go +++ b/decode/decode_test.go @@ -6,6 +6,7 @@ import ( "github.com/bevzzz/nb/schema" "github.com/bevzzz/nb/schema/common" + _ "github.com/bevzzz/nb/schema/v3" _ "github.com/bevzzz/nb/schema/v4" "github.com/bevzzz/nb/decode" @@ -94,9 +95,24 @@ func TestDecodeBytes(t *testing.T) { {"cell_type": "markdown", "metadata": {}, "source": []}, {"cell_type": "markdown", "metadata": {}, "source": []} ] - }`, + }`, nCells: 2, }, + { + name: "v3.0", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + ]}, + {"cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []} + ]} + ] + }`, + nCells: 3, + }, } { t.Run(tt.name, func(t *testing.T) { nb, err := decode.Bytes([]byte(tt.json)) @@ -224,6 +240,21 @@ func TestDecodeBytes(t *testing.T) { Data: []byte("base64-encoded-image-data"), }, }, + { + name: "v3.0: no explicit mime-type", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "raw", "source": ["sometimes you just want to rawdog sqweel"]} + ]} + ] + }`, + want: WithAttachments{Cell: Cell{ + Type: schema.Raw, + MimeType: common.PlainText, + Text: []byte("sometimes you just want to rawdog sqweel"), + }}, + }, } { t.Run(tt.name, func(t *testing.T) { nb, err := decode.Bytes([]byte(tt.json)) @@ -259,7 +290,8 @@ func TestDecodeBytes(t *testing.T) { { "cell_type": "code", "execution_count": 5, "source": ["print('Hi, mom!')"], "outputs": [ - {"output_type": "stream"}, {"output_type": "stream"} + {"output_type": "stream", "name": "stdout"}, + {"output_type": "stream", "name": "stderr"} ] } ] @@ -275,6 +307,32 @@ func TestDecodeBytes(t *testing.T) { 
OutputLen: 2, }, }, + { + name: "v3.0", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + { + "cell_type": "code", "language": "javascript", "prompt_number": 5, + "input": ["print('Hi, mom!')"], "outputs": [ + {"output_type": "stream", "stream": "stdout"}, + {"output_type": "stream", "stream": "stderr"} + ] + } + ]} + ] + }`, + want: outcome{ + Cell: Cell{ + Type: schema.Code, + MimeType: "application/x-python", // FIXME: expect language-specific mime-type + Text: []byte("print('Hi, mom!')"), + }, + Language: "javascript", + ExecutionCount: 5, + OutputLen: 2, + }, + }, } { t.Run(tt.name, func(t *testing.T) { nb, err := decode.Bytes([]byte(tt.json)) @@ -321,6 +379,28 @@ func TestDecodeBytes(t *testing.T) { }}, }, }, + { + name: "v3.0: stream output to stdout", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "code", "outputs": [ + { + "output_type": "stream", "stream": "stdout", + "text": ["$> ls\n", ".\n", "..\n", "nb/"] + } + ]} + ]} + ] + }`, + want: []output{ + {Cell: Cell{ + Type: schema.Stream, + MimeType: common.Stdout, + Text: []byte("$> ls\n.\n..\nnb/"), + }}, + }, + }, { name: "v4.4: stream output to stderr", json: `{ @@ -342,6 +422,28 @@ func TestDecodeBytes(t *testing.T) { }}, }, }, + { + name: "v3.0: stream output to stderr", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "code", "outputs": [ + { + "output_type": "stream", "stream": "stderr", + "text": ["KeyError: ", "dict['unknown key']"] + } + ]} + ]} + ] + }`, + want: []output{ + {Cell: Cell{ + Type: schema.Stream, + MimeType: common.Stderr, + Text: []byte("KeyError: dict['unknown key']"), + }}, + }, + }, { name: "v4.4: stream output to unrecognized target", json: `{ @@ -371,13 +473,13 @@ func TestDecodeBytes(t *testing.T) { {"cell_type": "code", "outputs": [ {"output_type": "display_data", "metadata": {}, "data": { - "image/png": 
"base64-encoded-png-image", + "image/png": "base64-encoded-png-image", "text/plain": "
" } }, {"output_type": "display_data", "metadata": {}, "data": { - "image/jpeg": "base64-encoded-jpeg-image", + "image/jpeg": "base64-encoded-jpeg-image", "text/plain": "
" } }, @@ -407,6 +509,93 @@ func TestDecodeBytes(t *testing.T) { }}, }, }, + { + name: "v3.0: display_data output different recognized formats", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "code", "outputs": [ + {"output_type": "display_data", "metadata": {}, + "png": ["base64-encoded-png-image"], + "text": ["
"] + }, + {"output_type": "display_data", "metadata": {}, + "jpeg": ["base64-encoded-jpeg-image"], + "text": ["
"] + }, + {"output_type": "display_data", "metadata": {}, + "html": [""] + }, + {"output_type": "display_data", "metadata": {}, + "svg": [""] + }, + {"output_type": "display_data", "metadata": {}, + "javascript": ["[,,,].length"] + }, + {"output_type": "display_data", "metadata": {}, + "json": ["{\"foo\": \"bar\"}"] + }, + {"output_type": "display_data", "metadata": {}, + "pdf": ["some-raw-pdf-data"] + }, + {"output_type": "display_data", "metadata": {}, + "latex": ["c = \\sqrt{a^2 + b^2}"] + }, + {"output_type": "display_data", "metadata": {}, + "text": [""] + } + ]} + ]} + ] + }`, + want: []output{ + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "image/png", + Text: []byte("base64-encoded-png-image"), + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "image/jpeg", + Text: []byte("base64-encoded-jpeg-image"), + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "text/html", + Text: []byte(``), + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "image/svg+xml", + Text: []byte(``), + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "text/javascript", + Text: []byte("[,,,].length"), + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "application/json", + Text: []byte("{\"foo\": \"bar\"}"), // ???? + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "application/pdf", + Text: []byte("some-raw-pdf-data"), // ???? + }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: "application/x-latex", + Text: []byte("c = \\sqrt{a^2 + b^2}"), // ???? 
+ }}, + {Cell: Cell{ + Type: schema.DisplayData, + MimeType: common.PlainText, + Text: []byte(""), + }}, + }, + }, { name: "v4.4: execute_result output with several images and a plain text", json: `{ @@ -441,6 +630,101 @@ func TestDecodeBytes(t *testing.T) { }}, }, }, + { + name: "v3.0: pyout (execute_result) output different recognized formats", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "code", "outputs": [ + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "png": ["base64-encoded-png-image"], + "text": ["
"] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "jpeg": ["base64-encoded-jpeg-image"], + "text": ["
"] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "html": [""] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "svg": [""] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "javascript": ["[,,,].length"] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "json": ["{\"foo\": \"bar\"}"] + }, + {"output_type": "pyout", "metadata": {}, + "pdf": ["some-raw-pdf-data"] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "latex": ["c = \\sqrt{a^2 + b^2}"] + }, + {"output_type": "pyout", "metadata": {}, + "prompt_number": 42, + "text": [""] + } + ]} + ]} + ] + }`, + want: []output{ + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "image/png", + Text: []byte("base64-encoded-png-image"), + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "image/jpeg", + Text: []byte("base64-encoded-jpeg-image"), + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "text/html", + Text: []byte(``), + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "image/svg+xml", + Text: []byte(``), + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "text/javascript", + Text: []byte("[,,,].length"), + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "application/json", + Text: []byte("{\"foo\": \"bar\"}"), // ???? + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "application/pdf", + Text: []byte("some-raw-pdf-data"), // ???? + }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: "application/x-latex", + Text: []byte("c = \\sqrt{a^2 + b^2}"), // ???? 
+ }}, + {ExecutionCount: 42, Cell: Cell{ + Type: schema.ExecuteResult, + MimeType: common.PlainText, + Text: []byte(""), + }}, + }, + }, { name: "v4.4: error output", json: `{ @@ -467,6 +751,33 @@ func TestDecodeBytes(t *testing.T) { }}, }, }, + { + name: "v3.0: error output", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "code", "outputs": [ + { + "output_type": "pyerr", "ename": "ZeroDivisionError", "evalue": "division by zero", + "traceback": [ + "Traceback (most recent call last):", + "\tFile \"main.py\", line 3, in ", + "\t\tprint(n/0)", + "\tZeroDivisionError: division by zero" + ] + } + ]} + ]} + ] + }`, + want: []output{ + {Cell: Cell{ + Type: schema.Error, + MimeType: common.Stderr, + Text: []byte("Traceback (most recent call last):\n\tFile \"main.py\", line 3, in \n\t\tprint(n/0)\n\tZeroDivisionError: division by zero"), + }}, + }, + }, } { t.Run(tt.name, func(t *testing.T) { nb, err := decode.Bytes([]byte(tt.json)) @@ -483,6 +794,43 @@ func TestDecodeBytes(t *testing.T) { }) } }) + + t.Run("heading cells", func(t *testing.T) { + for _, tt := range []struct { + name string + json string + want Cell + }{ + { + name: "v3.0 used to have dedicated heading cells", + json: `{ + "nbformat": 3, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + { + "cell_type": "heading", "level": 2, + "source": ["Fun facts about Ronald McDonald"], "metadata": {} + } + ]} + ] + }`, + want: Cell{ + Type: schema.Markdown, + MimeType: common.MarkdownText, + Text: []byte("## Fun facts about Ronald McDonald"), + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + nb, err := decode.Bytes([]byte(tt.json)) + require.NoError(t, err) + + got := nb.Cells() + require.Len(t, got, 1, "expected 1 cell") + + checkCell(t, got[0], tt.want) + }) + } + }) } // checkCell compares the cell's type and content to expected. 
diff --git a/schema/common/notebook.go b/schema/common/notebook.go index 8246fe7..482e50d 100644 --- a/schema/common/notebook.go +++ b/schema/common/notebook.go @@ -10,7 +10,6 @@ type Notebook struct { VersionMajor int `json:"nbformat"` VersionMinor int `json:"nbformat_minor"` Metadata json.RawMessage `json:"metadata"` // TODO: omitempty - Cells []json.RawMessage `json:"cells"` } func (n *Notebook) Version() schema.Version { diff --git a/schema/v3/schema.go b/schema/v3/schema.go new file mode 100644 index 0000000..a6cbbff --- /dev/null +++ b/schema/v3/schema.go @@ -0,0 +1,372 @@ +package v3 + +import ( + "bytes" + "encoding/json" + "fmt" + "strings" + + "github.com/bevzzz/nb/decode" + "github.com/bevzzz/nb/schema" + "github.com/bevzzz/nb/schema/common" +) + +func init() { + decode.RegisterDecoder(schema.Version{Major: 3, Minor: 0}, new(decoder)) +} + +// decoder decodes cell contents and metadata for nbformat v3.0. +type decoder struct{} + +var _ decode.Decoder = (*decoder)(nil) + +func (d *decoder) ExtractCells(data []byte) ([]json.RawMessage, error) { + var raw struct { + Worksheets []struct { + Cells []json.RawMessage `json:"cells"` + } `json:"worksheets"` + } + if err := json.Unmarshal(data, &raw); err != nil { + return nil, err + } + + var cells []json.RawMessage + for i := range raw.Worksheets { + cells = append(cells, raw.Worksheets[i].Cells...) 
+ } + return cells, nil +} + +func (d *decoder) DecodeMeta(data []byte) (schema.NotebookMetadata, error) { + return nil, nil +} + +func (d *decoder) DecodeCell(m map[string]interface{}, data []byte, meta schema.NotebookMetadata) (schema.Cell, error) { + var ct interface{} + var c schema.Cell + switch ct = m["cell_type"]; ct { + case "markdown": + c = &Markdown{} + case "heading": + c = &Heading{} + case "raw": + c = &Raw{} + case "code": + c = &Code{} + default: + return nil, fmt.Errorf("unknown cell type %q", ct) + } + if err := json.Unmarshal(data, &c); err != nil { + return nil, fmt.Errorf("%s: %w", ct, err) + } + return c, nil +} + +// Markdown defines the schema for a "markdown" cell. +type Markdown struct { + Source common.MultilineString `json:"source"` +} + +var _ schema.Cell = (*Markdown)(nil) + +func (md *Markdown) Type() schema.CellType { + return schema.Markdown +} + +func (md *Markdown) MimeType() string { + return common.MarkdownText +} + +func (md *Markdown) Text() []byte { + return md.Source.Text() +} + +// Heading is a dedicated cell type which represent a heading in a Jupyter notebook. +// This type is deprecated in the later versions and the content is stored as markdown instead. +// +// Heading cell behaves exactly like a markdown cell, decorating its source with the +// appropriate number of heading signs (#). +type Heading struct { + Source common.MultilineString `json:"source"` + Level int +} + +var _ schema.Cell = (*Heading)(nil) + +func (h *Heading) Type() schema.CellType { + return schema.Markdown +} + +func (h *Heading) MimeType() string { + return common.MarkdownText +} + +func (h *Heading) Text() []byte { + hashes := append(bytes.Repeat([]byte("#"), h.Level), " "...) + return append(hashes, h.Source.Text()...) +} + +// Raw defines the schema for a "raw" cell. 
+type Raw struct { + Source common.MultilineString `json:"source"` + Metadata RawCellMetadata `json:"metadata"` +} + +var _ schema.Cell = (*Raw)(nil) + +func (raw *Raw) Type() schema.CellType { + return schema.Raw +} + +func (raw *Raw) MimeType() string { + return raw.Metadata.MimeType() +} + +func (raw *Raw) Text() []byte { + return raw.Source.Text() +} + +// RawCellMetadata may specify a target conversion format. +type RawCellMetadata struct { + Format *string `json:"format"` + RawMimeType *string `json:"raw_mimetype"` +} + +// MimeType returns a more specific mime-type if one is provided and "text/plain" otherwise. +func (raw *RawCellMetadata) MimeType() string { + switch { + case raw.Format != nil: + return *raw.Format + case raw.RawMimeType != nil: + return *raw.RawMimeType + default: + return common.PlainText + } +} + +// Code defines the schema for a "code" cell. +type Code struct { + Source common.MultilineString `json:"input"` + TimesExecuted int `json:"prompt_number"` + Out []Output `json:"outputs"` + Lang string `json:"language"` +} + +var _ schema.CodeCell = (*Code)(nil) +var _ schema.Outputter = (*Code)(nil) + +func (code *Code) Type() schema.CellType { + return schema.Code +} + +// FIXME: return correct mime type (add a function to common) +func (code *Code) MimeType() string { + return "application/x-python" +} + +func (code *Code) Text() []byte { + return code.Source.Text() +} + +func (code *Code) Language() string { + return code.Lang +} + +func (code *Code) ExecutionCount() int { + return code.TimesExecuted +} + +func (code *Code) Outputs() (cells []schema.Cell) { + for i := range code.Out { + cells = append(cells, code.Out[i].cell) + } + return +} + +// Outputs unmarshals cell outputs into schema.Cell based on their type. 
+type Output struct { + cell schema.Cell +} + +func (out *Output) UnmarshalJSON(data []byte) error { + var v map[string]interface{} + if err := json.Unmarshal(data, &v); err != nil { + return fmt.Errorf("code outputs: %w", err) + } + + var t interface{} + var c schema.Cell + switch t = v["output_type"]; t { + case "stream": + c = &StreamOutput{} + case "display_data": + c = &DisplayDataOutput{} + case "pyout": + c = &ExecuteResultOutput{} + case "pyerr": + c = &ErrorOutput{} + default: + return fmt.Errorf("unknown output type %q", t) + } + + if err := json.Unmarshal(data, &c); err != nil { + return fmt.Errorf("%q output: %w", t, err) + } + out.cell = c + return nil +} + +// StreamOutput is a plain, text-based output of the executed code. +// Depending on the stream "target", Type() can report "text/plain" (stdout) or "error" (stderr). +// The output is often decorated with ANSI-color sequences, which should be handled separately. +type StreamOutput struct { + // Target can be stdout or stderr. + Target string `json:"stream"` + Source common.MultilineString `json:"text"` +} + +var _ schema.Cell = (*StreamOutput)(nil) + +func (stream *StreamOutput) Type() schema.CellType { + return schema.Stream +} + +func (stream *StreamOutput) MimeType() string { + switch stream.Target { + case "stdout": + return common.Stdout + case "stderr": + return common.Stderr + } + return common.PlainText +} + +func (stream *StreamOutput) Text() []byte { + return stream.Source.Text() +} + +// DisplayDataOutput are rich-format outputs generated by running the code in the parent cell. +type DisplayDataOutput struct { + MimeBundle + Metadata map[string]interface{} `json:"metadata"` +} + +var _ schema.Cell = (*DisplayDataOutput)(nil) + +func (dd *DisplayDataOutput) Type() schema.CellType { + return schema.DisplayData +} + +// MimeBundle contains rich output data keyed by mime-type. 
+type MimeBundle struct { + PNG common.MultilineString `json:"png,omitempty"` + JPEG common.MultilineString `json:"jpeg,omitempty"` + HTML common.MultilineString `json:"html,omitempty"` + SVG common.MultilineString `json:"svg,omitempty"` + Javascript common.MultilineString `json:"javascript,omitempty"` + JSON common.MultilineString `json:"json,omitempty"` + PDF common.MultilineString `json:"pdf,omitempty"` + LaTeX common.MultilineString `json:"latex,omitempty"` + Txt common.MultilineString `json:"text,omitempty"` +} + +var _ schema.MimeBundle = (*MimeBundle)(nil) + +// MimeType returns the richer of the mime-types present in the bundle, +// and falls back to "text/plain" otherwise. +func (mb MimeBundle) MimeType() string { + switch { + case mb.PNG != nil: + return "image/png" + case mb.JPEG != nil: + return "image/jpeg" + case mb.HTML != nil: + return "text/html" + case mb.SVG != nil: + return "image/svg+xml" + case mb.Javascript != nil: + return "text/javascript" + case mb.JSON != nil: + return "application/json" + case mb.PDF != nil: + return "application/pdf" + case mb.LaTeX != nil: + return "application/x-latex" + } + return common.PlainText +} + +// Text returns data with the richer mime-type. +func (mb MimeBundle) Text() []byte { + return mb.Data(mb.MimeType()) +} + +// Data returns mime-type-specific content if present and a nil slice otherwise. +func (mb MimeBundle) Data(mime string) []byte { + switch mime { + case "image/png": + return mb.PNG.Text() + case "image/jpeg": + return mb.JPEG.Text() + case "text/html": + return mb.HTML.Text() + case "image/svg+xml": + return mb.SVG.Text() + case "text/javascript": + return mb.Javascript.Text() + case "application/json": + return mb.JSON.Text() + case "application/pdf": + return mb.PDF.Text() + case "application/x-latex": + return mb.LaTeX.Text() + case common.PlainText: + return mb.Txt.Text() + } + return nil +} + +// PlainText returns data for "text/plain" mime-type and a nil slice otherwise. 
+func (mb MimeBundle) PlainText() []byte { + return mb.Data(common.PlainText) +} + +// ExecuteResultOutput is the result of executing the code in the cell. +// Its contents are identical to those of DisplayDataOutput with the addition of the execution count. +type ExecuteResultOutput struct { + DisplayDataOutput + TimesExecuted int `json:"prompt_number"` +} + +var _ schema.Cell = (*ExecuteResultOutput)(nil) +var _ schema.ExecutionCounter = (*ExecuteResultOutput)(nil) + +func (ex *ExecuteResultOutput) Type() schema.CellType { + return schema.ExecuteResult +} + +func (ex *ExecuteResultOutput) ExecutionCount() int { + return ex.TimesExecuted +} + +// ErrorOutput stores the output of a failed code execution. +type ErrorOutput struct { + ExceptionName string `json:"ename"` + ExceptionValue string `json:"evalue"` + Traceback []string `json:"traceback"` +} + +var _ schema.Cell = (*ErrorOutput)(nil) + +func (err *ErrorOutput) Type() schema.CellType { + return schema.Error +} + +func (err *ErrorOutput) MimeType() string { + return common.Stderr +} + +func (err *ErrorOutput) Text() (txt []byte) { + s := strings.Join(err.Traceback, "\n") + return []byte(s) +} diff --git a/schema/v4/schema.go b/schema/v4/schema.go index 630613a..c4b9369 100644 --- a/schema/v4/schema.go +++ b/schema/v4/schema.go @@ -26,6 +26,16 @@ type decoder struct{} var _ decode.Decoder = (*decoder)(nil) +func (d *decoder) ExtractCells(data []byte) ([]json.RawMessage, error) { + var raw struct { + Cells []json.RawMessage `json:"cells"` + } + if err := json.Unmarshal(data, &raw); err != nil { + return nil, err + } + return raw.Cells, nil +} + func (d *decoder) DecodeMeta(data []byte) (schema.NotebookMetadata, error) { var nm NotebookMetadata if err := json.Unmarshal(data, &nm); err != nil { @@ -163,7 +173,7 @@ func (code *Code) Type() schema.CellType { return schema.Code } -// TODO: return correct mime type (add a function to common) +// FIXME: return correct mime type (add a function to common) func (code 
*Code) MimeType() string { return "application/x-python" } @@ -303,7 +313,7 @@ func (mb MimeBundle) Data(mime string) []byte { return nil } -// RawText returns data for "text/plain" mime-type and a nil slice otherwise. +// PlainText returns data for "text/plain" mime-type and a nil slice otherwise. func (mb MimeBundle) PlainText() []byte { return mb.Data(common.PlainText) } diff --git a/version.go b/version.go index 3ac4da1..8e26083 100644 --- a/version.go +++ b/version.go @@ -2,6 +2,7 @@ package nb import ( // Currently supported nbformat versions: + _ "github.com/bevzzz/nb/schema/v3" _ "github.com/bevzzz/nb/schema/v4" ) From 87c1543088e8fbb62482edd2f0963bc1880afdeb Mon Sep 17 00:00:00 2001 From: dyma solovei Date: Mon, 26 Feb 2024 17:58:28 +0100 Subject: [PATCH 3/6] refactor: extract common schema structs --- schema/common/notebook.go | 63 +++++++++++++++++++++++++++++-- schema/v3/schema.go | 79 +++++++-------------------------------- schema/v4/schema.go | 63 ++++++------------------------- 3 files changed, 84 insertions(+), 121 deletions(-) diff --git a/schema/common/notebook.go b/schema/common/notebook.go index 482e50d..ae289ab 100644 --- a/schema/common/notebook.go +++ b/schema/common/notebook.go @@ -7,9 +7,9 @@ import ( ) type Notebook struct { - VersionMajor int `json:"nbformat"` - VersionMinor int `json:"nbformat_minor"` - Metadata json.RawMessage `json:"metadata"` // TODO: omitempty + VersionMajor int `json:"nbformat"` + VersionMinor int `json:"nbformat_minor"` + Metadata json.RawMessage `json:"metadata"` // TODO: omitempty } func (n *Notebook) Version() schema.Version { @@ -25,3 +25,60 @@ const ( Stdout = "application/vnd.jupyter.stdout" // Custom mime-type for stream output to stdout. Stderr = "application/vnd.jupyter.stderr" // Custom mime-type for stream output to stderr. ) + +// Markdown defines the schema for a "markdown" cell. 
+type Markdown struct { + Source MultilineString `json:"source"` +} + +var _ schema.Cell = (*Markdown)(nil) + +func (md *Markdown) Type() schema.CellType { + return schema.Markdown +} + +func (md *Markdown) MimeType() string { + return MarkdownText +} + +func (md *Markdown) Text() []byte { + return md.Source.Text() +} + +// Raw defines the schema for a "raw" cell. +type Raw struct { + Source MultilineString `json:"source"` + Metadata RawCellMetadata `json:"metadata"` +} + +var _ schema.Cell = (*Raw)(nil) + +func (raw *Raw) Type() schema.CellType { + return schema.Raw +} + +func (raw *Raw) MimeType() string { + return raw.Metadata.MimeType() +} + +func (raw *Raw) Text() []byte { + return raw.Source.Text() +} + +// RawCellMetadata may specify a target conversion format. +type RawCellMetadata struct { + Format *string `json:"format"` + RawMimeType *string `json:"raw_mimetype"` +} + +// MimeType returns a more specific mime-type if one is provided and "text/plain" otherwise. +func (raw *RawCellMetadata) MimeType() string { + switch { + case raw.Format != nil: + return *raw.Format + case raw.RawMimeType != nil: + return *raw.RawMimeType + default: + return PlainText + } +} diff --git a/schema/v3/schema.go b/schema/v3/schema.go index a6cbbff..6772ecb 100644 --- a/schema/v3/schema.go +++ b/schema/v3/schema.go @@ -1,3 +1,10 @@ +// Package v3 provides a decoder for Jupyter Notebooks v1.0, v2.0, and v3.0. +// +// It implements the IPython Notebook v3.0 JSON Schema, which is also suitable +// for decoding all earlier versions, as there hasn't been any breaking changes +// to it. +// +// [IPython Notebook v3.0 JSON Schema]: https://github.com/jupyter/nbformat/blob/main/nbformat/v3/nbformat.v3.schema.json package v3 import ( @@ -62,24 +69,10 @@ func (d *decoder) DecodeCell(m map[string]interface{}, data []byte, meta schema. return c, nil } -// Markdown defines the schema for a "markdown" cell. 
-type Markdown struct { - Source common.MultilineString `json:"source"` -} - -var _ schema.Cell = (*Markdown)(nil) - -func (md *Markdown) Type() schema.CellType { - return schema.Markdown -} - -func (md *Markdown) MimeType() string { - return common.MarkdownText -} - -func (md *Markdown) Text() []byte { - return md.Source.Text() -} +type ( + Markdown = common.Markdown + Raw = common.Raw +) // Heading is a dedicated cell type which represent a heading in a Jupyter notebook. // This type is deprecated in the later versions and the content is stored as markdown instead. @@ -87,63 +80,17 @@ func (md *Markdown) Text() []byte { // Heading cell behaves exactly like a markdown cell, decorating its source with the // appropriate number of heading signs (#). type Heading struct { - Source common.MultilineString `json:"source"` - Level int + Markdown + Level int `json:"level"` } var _ schema.Cell = (*Heading)(nil) -func (h *Heading) Type() schema.CellType { - return schema.Markdown -} - -func (h *Heading) MimeType() string { - return common.MarkdownText -} - func (h *Heading) Text() []byte { hashes := append(bytes.Repeat([]byte("#"), h.Level), " "...) return append(hashes, h.Source.Text()...) } -// Raw defines the schema for a "raw" cell. -type Raw struct { - Source common.MultilineString `json:"source"` - Metadata RawCellMetadata `json:"metadata"` -} - -var _ schema.Cell = (*Raw)(nil) - -func (raw *Raw) Type() schema.CellType { - return schema.Raw -} - -func (raw *Raw) MimeType() string { - return raw.Metadata.MimeType() -} - -func (raw *Raw) Text() []byte { - return raw.Source.Text() -} - -// RawCellMetadata may specify a target conversion format. -type RawCellMetadata struct { - Format *string `json:"format"` - RawMimeType *string `json:"raw_mimetype"` -} - -// MimeType returns a more specific mime-type if one is provided and "text/plain" otherwise. 
-func (raw *RawCellMetadata) MimeType() string { - switch { - case raw.Format != nil: - return *raw.Format - case raw.RawMimeType != nil: - return *raw.RawMimeType - default: - return common.PlainText - } -} - // Code defines the schema for a "code" cell. type Code struct { Source common.MultilineString `json:"input"` diff --git a/schema/v4/schema.go b/schema/v4/schema.go index c4b9369..fb64e55 100644 --- a/schema/v4/schema.go +++ b/schema/v4/schema.go @@ -1,3 +1,9 @@ +// Package v4 provides a decoder for Jupyter Notebooks v4.0 and later minor versions. +// +// It implements the IPython Notebook v4.0 JSON Schema. Other minor versions can be decoded using the same, +// as the differences do not affect how the notebook is rendered. +// +// [IPython Notebook v4.0 JSON Schema]: https://github.com/jupyter/nbformat/blob/main/nbformat/v4/nbformat.v4.0.schema.json package v4 import ( @@ -19,9 +25,7 @@ func init() { decode.RegisterDecoder(schema.Version{Major: 4, Minor: 0}, new(decoder)) } -// decoder decodes cell contents and metadata for nbformat v4.4. -// Other versions can be decoded using the same, as their schema -// differs in ways that does not affect how the notebook is rendered. +// decoder decodes cell contents and metadata for nbformat v4.0. type decoder struct{} var _ decode.Decoder = (*decoder)(nil) @@ -78,51 +82,24 @@ func (nm *NotebookMetadata) Language() string { // Markdown defines the schema for a "markdown" cell. 
type Markdown struct { - Att Attachments `json:"attachments,omitempty"` - Source common.MultilineString `json:"source"` + common.Markdown + Att Attachments `json:"attachments,omitempty"` } -var _ schema.Cell = (*Markdown)(nil) var _ schema.HasAttachments = (*Markdown)(nil) -func (md *Markdown) Type() schema.CellType { - return schema.Markdown -} - -func (md *Markdown) MimeType() string { - return common.MarkdownText -} - -func (md *Markdown) Text() []byte { - return md.Source.Text() -} - func (md *Markdown) Attachments() schema.Attachments { return md.Att } // Raw defines the schema for a "raw" cell. type Raw struct { - Att Attachments `json:"attachments,omitempty"` - Source common.MultilineString `json:"source"` - Metadata RawCellMetadata `json:"metadata"` + common.Raw + Att Attachments `json:"attachments,omitempty"` } -var _ schema.Cell = (*Raw)(nil) var _ schema.HasAttachments = (*Raw)(nil) -func (raw *Raw) Type() schema.CellType { - return schema.Raw -} - -func (raw *Raw) MimeType() string { - return raw.Metadata.MimeType() -} - -func (raw *Raw) Text() []byte { - return raw.Source.Text() -} - func (raw *Raw) Attachments() schema.Attachments { return raw.Att } @@ -140,24 +117,6 @@ func (att Attachments) MimeBundle(filename string) schema.MimeBundle { return mb } -// RawCellMetadata may specify a target conversion format. -type RawCellMetadata struct { - Format *string `json:"format"` - RawMimeType *string `json:"raw_mimetype"` -} - -// MimeType returns a more specific mime-type if one is provided and "text/plain" otherwise. -func (raw *RawCellMetadata) MimeType() string { - switch { - case raw.Format != nil: - return *raw.Format - case raw.RawMimeType != nil: - return *raw.RawMimeType - default: - return common.PlainText - } -} - // Code defines the schema for a "code" cell. 
type Code struct { Source common.MultilineString `json:"source"` From c9684442360b4cba7e1342c5d85b1be4f49ef9e0 Mon Sep 17 00:00:00 2001 From: dyma solovei Date: Mon, 26 Feb 2024 18:01:26 +0100 Subject: [PATCH 4/6] feat: support v1.0 and v2.0 notebooks Turns out, v1 and v2 only differ in how Jupyter interprets them, not in the schema itself. We can use the same decoder we use for v3. --- decode/decode_test.go | 30 ++++++++++++++++++++++++++++++ schema/v3/schema.go | 4 +++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/decode/decode_test.go b/decode/decode_test.go index 7279145..e7f1f23 100644 --- a/decode/decode_test.go +++ b/decode/decode_test.go @@ -113,6 +113,36 @@ func TestDecodeBytes(t *testing.T) { }`, nCells: 3, }, + { + name: "v2.0", + json: `{ + "nbformat": 2, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + ]}, + {"cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []} + ]} + ] + }`, + nCells: 3, + }, + { + name: "v1.0", + json: `{ + "nbformat": 1, "nbformat_minor": 0, "metadata": {}, "worksheets": [ + {"cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []}, + {"cell_type": "markdown", "metadata": {}, "source": []} + ]}, + {"cells": [ + {"cell_type": "markdown", "metadata": {}, "source": []} + ]} + ] + }`, + nCells: 3, + }, } { t.Run(tt.name, func(t *testing.T) { nb, err := decode.Bytes([]byte(tt.json)) diff --git a/schema/v3/schema.go b/schema/v3/schema.go index 6772ecb..92654ac 100644 --- a/schema/v3/schema.go +++ b/schema/v3/schema.go @@ -20,9 +20,11 @@ import ( func init() { decode.RegisterDecoder(schema.Version{Major: 3, Minor: 0}, new(decoder)) + decode.RegisterDecoder(schema.Version{Major: 2, Minor: 0}, new(decoder)) + decode.RegisterDecoder(schema.Version{Major: 1, Minor: 0}, new(decoder)) } -// decoder decodes cell contents and metadata for nbformat v3.0. 
+// decoder decodes cell contents and metadata for nbformat v3.0, v2.0, and v1.0. type decoder struct{} var _ decode.Decoder = (*decoder)(nil) From e207fd553b21dd742d106ce1885ede7be25155c1 Mon Sep 17 00:00:00 2001 From: dyma solovei Date: Mon, 26 Feb 2024 18:05:34 +0100 Subject: [PATCH 5/6] chore: update version.go to reflect current release version --- version.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.go b/version.go index 8e26083..2a4b750 100644 --- a/version.go +++ b/version.go @@ -8,5 +8,5 @@ import ( // Version returns current release version. func Version() string { - return "v0.2.0" + return "v0.2.1" } From 8371047b42a4ef5bc5988d7e3f861750f859c8e9 Mon Sep 17 00:00:00 2001 From: dyma solovei Date: Mon, 26 Feb 2024 18:12:54 +0100 Subject: [PATCH 6/6] refactor: create 1 decoder instance per package This has no functional implications: decoder is an empty struct, so a single shared instance per package is sufficient. --- schema/v3/schema.go | 7 ++++--- schema/v4/schema.go | 13 +++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/schema/v3/schema.go b/schema/v3/schema.go index 92654ac..c50fb45 100644 --- a/schema/v3/schema.go +++ b/schema/v3/schema.go @@ -19,9 +19,10 @@ import ( ) func init() { - decode.RegisterDecoder(schema.Version{Major: 3, Minor: 0}, new(decoder)) - decode.RegisterDecoder(schema.Version{Major: 2, Minor: 0}, new(decoder)) - decode.RegisterDecoder(schema.Version{Major: 1, Minor: 0}, new(decoder)) + d := new(decoder) + decode.RegisterDecoder(schema.Version{Major: 3, Minor: 0}, d) + decode.RegisterDecoder(schema.Version{Major: 2, Minor: 0}, d) + decode.RegisterDecoder(schema.Version{Major: 1, Minor: 0}, d) } // decoder decodes cell contents and metadata for nbformat v3.0, v2.0, and v1.0. 
diff --git a/schema/v4/schema.go b/schema/v4/schema.go index fb64e55..c13a7c2 100644 --- a/schema/v4/schema.go +++ b/schema/v4/schema.go @@ -17,12 +17,13 @@ import ( ) func init() { - decode.RegisterDecoder(schema.Version{Major: 4, Minor: 5}, new(decoder)) - decode.RegisterDecoder(schema.Version{Major: 4, Minor: 4}, new(decoder)) - decode.RegisterDecoder(schema.Version{Major: 4, Minor: 3}, new(decoder)) - decode.RegisterDecoder(schema.Version{Major: 4, Minor: 2}, new(decoder)) - decode.RegisterDecoder(schema.Version{Major: 4, Minor: 1}, new(decoder)) - decode.RegisterDecoder(schema.Version{Major: 4, Minor: 0}, new(decoder)) + d := new(decoder) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 5}, d) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 4}, d) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 3}, d) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 2}, d) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 1}, d) + decode.RegisterDecoder(schema.Version{Major: 4, Minor: 0}, d) } // decoder decodes cell contents and metadata for nbformat v4.0.