From a3e3e61cbef0e7bd93888df1bc59b0b50c4a4a7f Mon Sep 17 00:00:00 2001 From: Patrick Dubroy Date: Fri, 3 Mar 2023 09:43:12 +0100 Subject: [PATCH 1/4] feat!: Make `any` consume a full code point, not a single code unit BREAKING CHANGE: this changes the meaning of `any` in user grammars --- doc/syntax-reference.md | 4 +++- packages/ohm-js/src/pexprs-eval.js | 6 +++--- packages/ohm-js/test/test-ohm-syntax.js | 22 ++++++++++++++++++++++ 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/doc/syntax-reference.md b/doc/syntax-reference.md index 552cd6db..d50bc788 100644 --- a/doc/syntax-reference.md +++ b/doc/syntax-reference.md @@ -146,7 +146,9 @@ as well as multiline (`/* */`) comments like: (See [src/built-in-rules.ohm](https://github.com/harc/ohm/blob/main/packages/ohm-js/src/built-in-rules.ohm).) -`any`: Matches the next character in the input stream, if one exists. +`any`: Matches the next Unicode character — i.e., a single code point — in the input stream, if one exists. + +**NOTE:** A JavaScript string is a sequence of 16-bit _code units_. Some Unicode characters, such as emoji, are encoded as pairs of 16-bit values. For example, the string `'😆'` has length 2, but contains a single Unicode code point. Prior to Ohm v17, `any` always consumed a single 16-bit code unit, rather than a full Unicode character. `letter`: Matches a single character which is a letter (either uppercase or lowercase). diff --git a/packages/ohm-js/src/pexprs-eval.js b/packages/ohm-js/src/pexprs-eval.js index 231c7fff..32f74727 100644 --- a/packages/ohm-js/src/pexprs-eval.js +++ b/packages/ohm-js/src/pexprs-eval.js @@ -29,9 +29,9 @@ pexprs.PExpr.prototype.eval = common.abstract('eval'); // function(state) { ... 
pexprs.any.eval = function(state) { const {inputStream} = state; const origPos = inputStream.pos; - const ch = inputStream.next(); - if (ch) { - state.pushBinding(new TerminalNode(ch.length), origPos); + const cp = inputStream.nextCodePoint(); + if (cp !== undefined) { + state.pushBinding(new TerminalNode(String.fromCodePoint(cp).length), origPos); return true; } else { state.processFailure(origPos, this); diff --git a/packages/ohm-js/test/test-ohm-syntax.js b/packages/ohm-js/test/test-ohm-syntax.js index 1f3c60fc..e619ccba 100644 --- a/packages/ohm-js/test/test-ohm-syntax.js +++ b/packages/ohm-js/test/test-ohm-syntax.js @@ -256,6 +256,28 @@ test('ranges w/ code points > 0xFFFF, special cases', t => { assertSucceeds(t, g2.match('\u{D83D}x')); }); +test('any consumes an entire code point', t => { + const g = ohm.grammar('G { start = any any }'); + const re = /../u; // The regex equivalent of `any any`. + + t.is('😇'.length, 2); + t.is('😇!'.length, 3); + t.is('😇😇'.length, 4); + + t.is(g.match('😇😇').succeeded(), true); + t.truthy(re.exec('😇😇')); + + t.is(g.match('😇!').succeeded(), true); + t.truthy(re.exec('😇!')); + + t.is(g.match('!😇').succeeded(), true); + t.truthy(re.exec('!😇')); + + t.is('👋🏿'.length, 4); // Skin color modifier is a separate code point. 
+ t.is(g.match('👋🏿').succeeded(), true); + t.truthy(re.exec('👋🏿')); +}); + describe('alt', test => { const m = ohm.grammar('M { altTest = "a" | "b" }'); const s = m.createSemantics().addAttribute('v', { From e0a70a14b6733af3673c7d0b06ac844af4d1b6b8 Mon Sep 17 00:00:00 2001 From: Patrick Dubroy Date: Fri, 3 Mar 2023 09:46:58 +0100 Subject: [PATCH 2/4] Add to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a64291ff..f6b67f21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Breaking changes: +- [#424]: `any` now consumes an entire code point (i.e., a full Unicode character), not just a single, 16-bit code unit. - [55c787b]: The namespace helpers (`namespace`, `extendNamespace`) have been removed. (These were always optional.) - [bea0be9]: When used as an ES module, the main 'ohm-js' module now has *only* named exports (i.e., no default export). The same is true for `ohm-js/extras`. - [#395]: In generated type definitions, action dictionary types now inherit from `BaseActionDict`, a new supertype of `ActionDict`. From 658522c3856c566907864c9c8138ee876fbefaa6 Mon Sep 17 00:00:00 2001 From: Patrick Dubroy Date: Fri, 3 Mar 2023 10:03:14 +0100 Subject: [PATCH 3/4] Add to release notes --- doc/releases/ohm-js-17.0.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/doc/releases/ohm-js-17.0.md b/doc/releases/ohm-js-17.0.md index cb43bba5..02622092 100644 --- a/doc/releases/ohm-js-17.0.md +++ b/doc/releases/ohm-js-17.0.md @@ -2,6 +2,24 @@ ## Upgrading +### `any` now consumes a full code point + +In JavaScript, a string is a sequence of 16-bit code units. Some Unicode characters, such as emoji, are encoded as pairs of 16-bit values. For example, the string '😆' has length 2, but contains a single Unicode code point. Previously, `any` matched a single 16-bit code unit — even if that unit was part of a surrogate pair. 
In v17, `any` now matches a full Unicode character. + +Old behaviour: + +```js +const g = ohm.grammar('OneChar { start = any }'); +g.match('😆').succeeded(); // false +``` + +New behaviour (Ohm v17+): + +```js +const g = ohm.grammar('OneChar { start = any }'); +g.match('😆').succeeded(); // true +``` + ### Namespace helpers removed The top-level `namespace` and `extendNamespace` functions have been removed. They were never required — it was always possible to use a plain old object in any API that asked for a namespace. From b328e96a0c7a9e46ab76b0cf4330d749fd2503bd Mon Sep 17 00:00:00 2001 From: Patrick Dubroy Date: Fri, 3 Mar 2023 16:12:28 +0100 Subject: [PATCH 4/4] chore(ohm-js): Bump version after merging missing PR --- packages/ohm-js/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ohm-js/package.json b/packages/ohm-js/package.json index 0f811d2a..faa66e30 100644 --- a/packages/ohm-js/package.json +++ b/packages/ohm-js/package.json @@ -1,6 +1,6 @@ { "name": "ohm-js", - "version": "17.0.0", + "version": "17.0.1", "description": "An object-oriented language for parsing and pattern matching", "repository": "https://github.com/harc/ohm", "keywords": [