From 3201e86cf80b949a7a772287b9c46adabf6f57d6 Mon Sep 17 00:00:00 2001 From: Martin Tournoij Date: Fri, 2 Jun 2023 19:35:56 +0200 Subject: [PATCH] Backout Unicode bare keys This backs out the Unicode bare keys from #891. This does *not* mean we can't include it in a future 1.2 (or 1.3, or whatever); just that right now there doesn't seem to be a clear consensus regarding normalisation and which characters to include. It's already the most discussed single issue in the history of TOML. I kind of hate doing this as it seems a step backwards; in principle I think we *should* have this so I'm not against the idea of the feature as such, but things seem to be at a bit of a stalemate right now, and this will allow TOML to move forward on other fronts. It hasn't come up *that* often; the issue (#687) wasn't filed until 2019, and has only 11 upvotes. Other than that, the issue was raised only once before in 2015 as far as I can find (#337). I also can't really find anyone asking for it in any of the HN threads on TOML. Reverting this means we can go forward releasing TOML 1.1, giving people access to the much more frequently requested relaxing of inline tables (#516, with 122 upvotes, and has come up on HN as well) and some other more minor things (e.g. `\e` has 12 upvotes in #715). Basically, a lot more people are waiting for this, and all things considered this seems a better path forward for now, unless someone comes up with a proposal which addresses all issues (I tried and thus far failed). 
I proposed this over here a few months ago, and the responses didn't seem too hostile to the idea: https://github.com/toml-lang/toml/issues/966#issuecomment-1469883434 --- CHANGELOG.md | 1 - toml.abnf | 14 +------------- toml.md | 25 +++++++------------------ 3 files changed, 8 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14d3cebc..0650637a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,6 @@ - Add new `\e` shorthand for the escape character. - Add \x00 notation to basic strings. - Seconds in Date-Time and Time values are now optional. -- Allow non-English scripts in unquoted (bare) keys - Clarify newline normalization in multi-line literal strings. ## 1.0.0 / 2021-01-11 diff --git a/toml.abnf b/toml.abnf index 0446f8b6..d788fb17 100644 --- a/toml.abnf +++ b/toml.abnf @@ -49,19 +49,7 @@ key = simple-key / dotted-key val = string / boolean / array / inline-table / date-time / float / integer simple-key = quoted-key / unquoted-key - -;; Unquoted key - -unquoted-key = 1*unquoted-key-char -unquoted-key-char = ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ -unquoted-key-char =/ %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions -unquoted-key-char =/ %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block -unquoted-key-char =/ %x37F-1FFF ; exclude GREEK QUESTION MARK, which is basically a semi-colon -unquoted-key-char =/ %x200C-200D / %x203F-2040 ; from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ -unquoted-key-char =/ %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics -unquoted-key-char =/ %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces -unquoted-key-char =/ %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) -unquoted-key-char =/ %x10000-EFFFF ; all chars outside BMP range, 
excluding Private Use planes (F0000-10FFFF) +unquoted-key = 1*( ALPHA / DIGIT / %x2D / %x5F ) ; A-Z / a-z / 0-9 / - / _ ;; Quoted and dotted key diff --git a/toml.md b/toml.md index 08cc9f67..320040c4 100644 --- a/toml.md +++ b/toml.md @@ -103,11 +103,9 @@ first = "Tom" last = "Preston-Werner" # INVALID A key may be either bare, quoted, or dotted. -**Bare keys** may contain any letter-like or number-like Unicode character from -any Unicode script, as well as ASCII digits, dashes and underscores. -Punctuation, spaces, arrows, box drawing and private use characters are not -allowed. Note that bare keys are allowed to be composed of only ASCII digits, -e.g. 1234, but are always interpreted as strings. +**Bare keys** may only contain ASCII letters, ASCII digits, underscores, and +dashes (`A-Za-z0-9_-`). Note that bare keys are allowed to be composed of only +ASCII digits, e.g. `1234`, but are always interpreted as strings. ℹ️ The exact ranges of allowed code points can be found in the [ABNF grammar file][abnf]. @@ -117,23 +115,18 @@ key = "value" bare_key = "value" bare-key = "value" 1234 = "value" -Fuß = "value" -😂 = "value" -汉语大字典 = "value" -辭源 = "value" -பெண்டிரேம் = "value" ``` **Quoted keys** follow the exact same rules as either basic strings or literal -strings and allow you to use any Unicode character in a key name, including -spaces. Best practice is to use bare keys except when absolutely necessary. +strings and allow you to use a much broader set of key names. Best practice is +to use bare keys except when absolutely necessary. 
```toml "127.0.0.1" = "value" "character encoding" = "value" +"ʎǝʞ" = "value" +'key2' = "value" 'quoted "value"' = "value" -"╠═╣" = "value" -"⋰∫∬∭⋱" = "value" ``` A bare key must be non-empty, but an empty quoted key is allowed (though @@ -154,7 +147,6 @@ name = "Orange" physical.color = "orange" physical.shape = "round" site."google.com" = true -பெண்.டிரேம் = "we are women" ``` In JSON land, that would give you the following structure: @@ -168,9 +160,6 @@ In JSON land, that would give you the following structure: }, "site": { "google.com": true - }, - "பெண்": { - "டிரேம்": "we are women" } } ```