Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(developer,common): verify normalization of strings 🙀 #12748

Merged
merged 6 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions common/web/types/src/kmx/kmx-plus/kmx-plus.ts
Original file line number Diff line number Diff line change
Expand Up @@ -174,12 +174,7 @@ export class Strs extends Section {
*/
allocString(s?: string, opts?: StrsOptions, sections?: DependencySections): StrsItem {
// Run the string processing pipeline
s = Strs.processString(s, opts, sections);

// add to the set, for testing
if (s) {
this.allProcessedStrings.add(s);
}
s = this.processString(s, opts, sections);

// if it's a single char, don't push it into the strs table
if (opts?.singleOk && isOneChar(s)) {
Expand All @@ -196,8 +191,8 @@ export class Strs extends Section {
return result;
}

/** process everything according to opts */
static processString(s: string, opts: StrsOptions, sections: DependencySections) {
/** process everything according to opts, and add the string to this.allProcessedStrings */
private processString(s: string, opts: StrsOptions, sections: DependencySections) {
s = s ?? '';
// type check everything else
if (typeof s !== 'string') {
Expand All @@ -215,6 +210,12 @@ export class Strs extends Section {
if (opts?.unescape) {
s = unescapeString(s);
}

if (s) {
// add all processed strings here, so that we catch denormalized strings in the input
this.allProcessedStrings.add(s);
}

// nfd
if (opts?.nfd) {
if (!sections?.meta?.normalizationDisabled) {
Expand Down
9 changes: 9 additions & 0 deletions common/web/types/src/util/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,15 @@ export function isPUA(ch: number) {
(ch >= Uni_PUA_16_START && ch <= Uni_PUA_16_END));
}

/** @returns false if s is NEITHER NFC nor NFD. (Returns true for falsy) */
export function isNormalized(s: string) : boolean {
if(!s) return true; // empty or null
const nfc = s.normalize("NFC");
const nfd = s.normalize("NFD");
if (s !== nfc && s !== nfd) return false;
return true;
}

class BadStringMap extends Map<BadStringType, Set<number>> {
public toString() : string {
if (!this.size) {
Expand Down
20 changes: 19 additions & 1 deletion common/web/types/tests/util/unescape.tests.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import 'mocha';
import {assert} from 'chai';
import {unescapeString, UnescapeError, isOneChar, toOneChar, unescapeOneQuadString, BadStringAnalyzer, isValidUnicode, describeCodepoint, isPUA, BadStringType, unescapeStringToRegex, unescapeQuadString, NFDAnalyzer} from '../../src/util/util.js';
import {unescapeString, UnescapeError, isOneChar, toOneChar, unescapeOneQuadString, BadStringAnalyzer, isValidUnicode, describeCodepoint, isPUA, BadStringType, unescapeStringToRegex, unescapeQuadString, NFDAnalyzer, isNormalized} from '../../src/util/util.js';

describe('test UTF32 functions()', function() {
it('should properly categorize strings', () => {
Expand Down Expand Up @@ -186,6 +186,24 @@ describe('test bad char functions', () => {
assert.isTrue(isPUA(ch), describeCodepoint(ch));
}
});
describe('test isDenormalized()', () => {
it('should correctly categorize strings', () => {
[
undefined,
null,
'',
'ABC',
'fa\u1E69cinating', // NFC
'fas\u0323\u0307cinating', // NFD
'd\u0323\u0307', // NFD
'\u1e0d\u0307', // NFC
].map(s => assert.isTrue(isNormalized(s), `for string ${s}`));
[
'd\u0307\u0323', // NFD but reversed marks
'fas\u0307\u0323cinating', // not-NFD
].map(s => assert.isFalse(isNormalized(s), `for string ${s}`));
});
});
});

describe('test BadStringAnalyzer', () => {
Expand Down
4 changes: 4 additions & 0 deletions developer/src/kmc-ldml/src/compiler/empty-compiler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ export class StrsCompiler extends EmptyCompiler {
const badStringAnalyzer = new util.BadStringAnalyzer();
const CONTAINS_MARKER_REGEX = new RegExp(LdmlKeyboardTypes.MarkerParser.ANY_MARKER_MATCH);
for (let s of strs.allProcessedStrings.values()) {
// stop at the first denormalized string
if (!util.isNormalized(s)) {
this.callbacks.reportMessage(LdmlCompilerMessages.Warn_StringDenorm({s}));
}
// replace all \\uXXXX with the actual code point.
// this lets us analyze whether there are PUA, unassigned, etc.
// the results might not be valid regex of course.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,11 @@ export class LdmlCompilerMessages {
`Invalid marker identifier "\m{${def(o.id)}}". Identifiers must be between 1 and 32 characters, and can use A-Z, a-z, 0-9, and _.`,
);

// Available: 0x02B-0x2F
static WARN_StringDenorm = SevWarn | 0x002B;
static Warn_StringDenorm = (o: { s: string }) =>
m(this.WARN_StringDenorm, `File contains string "${def(o.s)}" that is neither NFC nor NFD.`);

// Available: 0x02C-0x2F

static ERROR_InvalidQuadEscape = SevError | 0x0030;
static Error_InvalidQuadEscape = (o: { cp: number }) =>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>

<keyboard3 xmlns="https://schemas.unicode.org/cldr/45/keyboard3" locale="mt" conformsTo="45">
<version number="1.0.0" />

<info author="srl295" indicator="🙀" layout="qwerty" name="TestKbd" />

<keys>
<key id="ess" output="s\u{0307}\u{0323}" /> <!-- not NFD-->
</keys>

<layers formId="us" minDeviceWidth="123">
<layer id="zz">
<row keys="ess" />
</layer>
</layers>

</keyboard3>
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>

<keyboard3 xmlns="https://schemas.unicode.org/cldr/45/keyboard3" locale="mt" conformsTo="45">
<version number="1.0.0" />

<info author="srl295" indicator="🙀" layout="qwerty" name="TestKbd" />

<keys>
<key id="ess" output="${ess}" />
</keys>

<variables>
<string id="ess" value="s\u{0307}\u{0323}" /> <!-- Not NFD -->
</variables>

<layers formId="us" minDeviceWidth="123">
<layer id="zz">
<row keys="ess" />
</layer>
</layers>

</keyboard3>
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>

<keyboard3 xmlns="https://schemas.unicode.org/cldr/45/keyboard3" locale="mt" conformsTo="45">
<version number="1.0.0" />

<info author="srl295" indicator="🙀" layout="qwerty" name="TestKbd" />

<displays>
<display keyId="ess" display="s\u{0307}\u{0323}" /> <!-- not NFD-->
</displays>

<keys>
<key id="ess" output="${ess}" />
</keys>

<variables>
<string id="ess" value="S" />
</variables>

<layers formId="us" minDeviceWidth="123">
<layer id="zz">
<row keys="ess" />
</layer>
</layers>

</keyboard3>
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>

<keyboard3 xmlns="https://schemas.unicode.org/cldr/45/keyboard3" locale="mt" conformsTo="45">
<version number="1.0.0" />

<info author="srl295" indicator="🙀" layout="qwerty" name="TestKbd" />


<keys>
<key id="ess" output="S" />
</keys>


<layers formId="us" minDeviceWidth="123">
<layer id="zz">
<row keys="ess" />
</layer>
</layers>

<transforms type="simple">
<transformGroup>
<transform from="S" to="s\u{0307}\u{0323}" /> <!-- Not NFD-->
</transformGroup>
</transforms>

</keyboard3>
21 changes: 21 additions & 0 deletions developer/src/kmc-ldml/test/strs.tests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,25 @@ describe('strs', function () {
]);
assert.isNull(kmx); // should fail post-validate
});
describe('should warn on denorm strings', async function () {
for (const num of [1, 2, 3, 4]) {
const path = `sections/strs/warn-denorm-${num}.xml`;
it(path, async function () {
const inputFilename = makePathToFixture(path);
const s = 's\u0307\u0323'; // ṩ
// Compile the keyboard
const kmx = await compileKeyboard(inputFilename, { ...compilerTestOptions, saveDebug: true, shouldAddCompilerVersion: false },
[
// validation messages
LdmlCompilerMessages.Warn_StringDenorm({s}),
],
false, // validation should pass
[
// same messages
LdmlCompilerMessages.Warn_StringDenorm({s}),
]);
assert.isNotNull(kmx); // not failing
});
}
});
});
Loading