Skip to content

Commit

Permalink
Merge pull request unicode-rs#96 from Jules-Bertholet/tables-fmt
Browse files — browse the repository at this point in the history
Ensure `tables.rs` passes rustfmt
  • Loading branch information
Manishearth authored Mar 4, 2024
2 parents c49e96f + 42fd2c1 commit ac8fa20
Show file tree
Hide file tree
Showing 5 changed files with 9,928 additions and 25,227 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Regen
run: cd scripts && python3 unicode.py
- name: Diff tables
Expand Down
59 changes: 27 additions & 32 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
import collections
import urllib.request
from itertools import batched

UNICODE_VERSION = "15.1.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION
Expand Down Expand Up @@ -354,20 +355,26 @@ def is_first_and_last(first, last):
return False
return first[1:-8] == last[1:-7]

def gen_mph_data(name, d, kv_type, kv_callback):
def gen_mph_data(name, d, kv_type, kv_callback, kv_row_width):
(salt, keys) = minimal_perfect_hash(d)
out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
for s in salt:
out.write(" 0x{:x},\n".format(s))
out.write(f"\npub(crate) const {name.upper()}_SALT: &[u16] = &[\n")
for s_row in batched(salt, 13):
out.write(" ")
for s in s_row:
out.write(f" 0x{s:03X},")
out.write("\n")
out.write("];\n")
out.write(f"pub(crate) const {name.upper()}_KV: &[{kv_type}] = &[\n")
for k_row in batched(keys, kv_row_width):
out.write(" ")
for k in k_row:
out.write(f" {kv_callback(k)},")
out.write("\n")
out.write("];\n")
out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
for k in keys:
out.write(" {},\n".format(kv_callback(k)))
out.write("];\n\n")

def gen_combining_class(combining_classes, out):
gen_mph_data('canonical_combining_class', combining_classes, 'u32',
lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))
lambda k: f"0x{int(combining_classes[k]) | (k << 8):07X}", 8)

def gen_composition_table(canon_comp, out):
table = {}
Expand All @@ -376,7 +383,7 @@ def gen_composition_table(canon_comp, out):
table[(c1 << 16) | c2] = c3
(salt, keys) = minimal_perfect_hash(table)
gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))
lambda k: f"(0x{k:08X}, '\\u{{{table[k]:06X}}}')", 1)

out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
out.write(" match (c1, c2) {\n")
Expand All @@ -403,7 +410,7 @@ def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_de
assert offset < 65536
out.write("];\n")
gen_mph_data(name + '_decomposed', table, "(u32, (u16, u16))",
lambda k: "(0x{:x}, ({}, {}))".format(k, offsets[k], len(table[k])))
lambda k: f"(0x{k:05X}, (0x{offsets[k]:03X}, 0x{len(table[k]):X}))", 1)

def gen_qc_match(prop_table, out):
out.write(" match c {\n")
Expand All @@ -412,7 +419,7 @@ def gen_qc_match(prop_table, out):
assert data in ('N', 'M')
result = "No" if data == 'N' else "Maybe"
if high:
out.write(r" '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
out.write(r" '\u{%s}'..='\u{%s}' => %s," % (low, high, result))
else:
out.write(r" '\u{%s}' => %s," % (low, result))
out.write("\n")
Expand All @@ -421,7 +428,7 @@ def gen_qc_match(prop_table, out):
out.write(" }\n")

def gen_nfc_qc(prop_tables, out):
out.write("#[inline]\n")
out.write("\n#[inline]\n")
out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
gen_qc_match(prop_tables['NFC_QC'], out)
Expand Down Expand Up @@ -450,7 +457,7 @@ def gen_nfkd_qc(prop_tables, out):

def gen_combining_mark(general_category_mark, out):
gen_mph_data('combining_mark', general_category_mark, 'u32',
lambda k: '0x{:04x}'.format(k))
lambda k: '0x{:05X}'.format(k), 10)

def gen_public_assigned(general_category_public_assigned, out):
# This could be done as a hash but the table is somewhat small.
Expand All @@ -464,17 +471,16 @@ def gen_public_assigned(general_category_public_assigned, out):
out.write(" ")
start = False
else:
out.write(" | ")
out.write("\n | ")
if first == last:
out.write("'\\u{%s}'\n" % hexify(first))
out.write("'\\u{%s}'" % hexify(first))
else:
out.write("'\\u{%s}'..='\\u{%s}'\n" % (hexify(first), hexify(last)))
out.write(" => true,\n")
out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
out.write(" => true,\n")

out.write(" _ => false,\n")
out.write(" }\n")
out.write("}\n")
out.write("\n")

def gen_stream_safe(leading, trailing, out):
# This could be done as a hash but the table is very small.
Expand All @@ -488,10 +494,9 @@ def gen_stream_safe(leading, trailing, out):
out.write(" _ => 0,\n")
out.write(" }\n")
out.write("}\n")
out.write("\n")

gen_mph_data('trailing_nonstarters', trailing, 'u32',
lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))
lambda k: f"0x{int(trailing[k]) | (k << 8):07X}", 8)

def gen_tests(tests, out):
out.write("""#[derive(Debug)]
Expand Down Expand Up @@ -579,43 +584,33 @@ def minimal_perfect_hash(d):
data = UnicodeData()
with open("tables.rs", "w", newline = "\n") as out:
out.write(PREAMBLE)
out.write("#![cfg_attr(rustfmt, rustfmt::skip)]\n")
out.write("use crate::quick_check::IsNormalized;\n")
out.write("use crate::quick_check::IsNormalized::*;\n")
out.write("\n")

version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
out.write("#[allow(unused)]\n")
out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n\n" % version)
out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n" % version)

gen_combining_class(data.combining_classes, out)
out.write("\n")

gen_composition_table(data.canon_comp, out)
out.write("\n")

gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)

gen_combining_mark(data.general_category_mark, out)
out.write("\n")

gen_public_assigned(data.general_category_public_assigned, out)
out.write("\n")

gen_nfc_qc(data.norm_props, out)
out.write("\n")

gen_nfkc_qc(data.norm_props, out)
out.write("\n")

gen_nfd_qc(data.norm_props, out)
out.write("\n")

gen_nfkd_qc(data.norm_props, out)
out.write("\n")

gen_stream_safe(data.ss_leading, data.ss_trailing, out)
out.write("\n")

with open("normalization_tests.rs", "w", newline = "\n") as out:
out.write(PREAMBLE)
Expand Down
2 changes: 0 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,6 @@ mod quick_check;
mod recompose;
mod replace;
mod stream_safe;

#[rustfmt::skip]
mod tables;

#[doc(hidden)]
Expand Down
10 changes: 5 additions & 5 deletions src/normalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ const L_LAST: u32 = L_BASE + L_COUNT - 1;
const V_LAST: u32 = V_BASE + V_COUNT - 1;
const T_LAST: u32 = T_BASE + T_COUNT - 1;

// Composition only occurs for `TPart`s in `U+11A8 ... U+11C2`,
// i.e. `T_BASE + 1 ... T_LAST`.
// Composition only occurs for `TPart`s in `U+11A8 ..= U+11C2`,
// i.e. `T_BASE + 1 ..= T_LAST`.
const T_FIRST: u32 = T_BASE + 1;

pub(crate) fn is_hangul_syllable(c: char) -> bool {
Expand Down Expand Up @@ -172,15 +172,15 @@ fn compose_hangul(a: char, b: char) -> Option<char> {
let (a, b) = (a as u32, b as u32);
match (a, b) {
// Compose a leading consonant and a vowel together into an LV_Syllable
(L_BASE...L_LAST, V_BASE...V_LAST) => {
(L_BASE..=L_LAST, V_BASE..=V_LAST) => {
let l_index = a - L_BASE;
let v_index = b - V_BASE;
let lv_index = l_index * N_COUNT + v_index * T_COUNT;
let s = S_BASE + lv_index;
Some(unsafe { char::from_u32_unchecked(s) })
}
// Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
(S_BASE...S_LAST, T_FIRST...T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
(S_BASE..=S_LAST, T_FIRST..=T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
}
_ => None,
Expand All @@ -193,7 +193,7 @@ mod tests {

// Regression test from a bugfix where we were composing an LV_Syllable with
// T_BASE directly. (We should only compose an LV_Syllable with a character
// in the range `T_BASE + 1 ... T_LAST`.)
// in the range `T_BASE + 1 ..= T_LAST`.)
#[test]
fn test_hangul_composition() {
assert_eq!(compose_hangul('\u{c8e0}', '\u{11a7}'), None);
Expand Down
Loading

0 comments on commit ac8fa20

Please sign in to comment.