diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 89c5f57..7731d4c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -22,6 +22,9 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' - name: Regen run: cd scripts && python3 unicode.py - name: Diff diff --git a/Cargo.toml b/Cargo.toml index a0f16da..ccd6c3e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,20 +2,23 @@ name = "unicode-width" version = "0.1.11" -authors = ["kwantam ", "Manish Goregaokar "] - +authors = [ + "kwantam ", + "Manish Goregaokar ", +] homepage = "https://github.com/unicode-rs/unicode-width" repository = "https://github.com/unicode-rs/unicode-width" documentation = "https://unicode-rs.github.io/unicode-width" license = "MIT/Apache-2.0" keywords = ["text", "width", "unicode"] readme = "README.md" +edition = "2021" description = """ Determine displayed width of `char` and `str` types according to Unicode Standard Annex #11 rules. 
""" -exclude = [ "target/*", "Cargo.lock" ] +exclude = ["target/*", "Cargo.lock"] [dependencies] std = { version = "1.0", package = "rustc-std-workspace-std", optional = true } diff --git a/scripts/unicode.py b/scripts/unicode.py index adda3d2..c2354d3 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -23,6 +23,8 @@ import os import re import sys +from collections import defaultdict +from itertools import batched NUM_CODEPOINTS = 0x110000 """An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace.""" @@ -389,9 +391,9 @@ def make_tables( return tables -def load_variation_sequences(width_map) -> "list[int]": +def load_variation_sequences() -> "list[int]": """Outputs a list of character ranages, corresponding to all the valid characters for starting - an emoji presentation sequence, exclusing those that are always wide.""" + an emoji presentation sequence.""" with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style") @@ -399,19 +401,68 @@ def load_variation_sequences(width_map) -> "list[int]": for line in sequences.readlines(): if match := sequence.match(line): cp = int(match.group(1), 16) - if width_map[cp] == EffectiveWidth.WIDE: - # this character would be width 2 even outside a variation sequence, - # so we don't need to store its info - continue codepoints.append(cp) return codepoints +def make_variation_sequence_table( + seqs: "list[int]", + width_map, +) -> "tuple[list[int], list[list[int]]]": + """Generates 2-level look up table for whether a codepoint might start an emoji presentation sequence. + (Characters that are always wide may be excluded.) + First level maps the most significant byte to a 4-bit index (or 0xFF if can't possibly start such a sequence), + second level is a bit array (each leaf is 512 bits long).""" + # The structure of the table currently relies on this. 
+ # It's unlikely to be a problem in the near future + # as this is enough to encompass the entire Basic Multilingual Plane and + # Supplementary Multilingual Plane. + # And the fix is easy if it ever does become a problem: + # just take the index from bits one position more significant (`cp >> 10`), + # and use 1024-bit leaves instead of 512-bit. + assert seqs[-1] <= 0x1FFFF + + prefixes_dict = defaultdict(list) + for cp in seqs: + prefixes_dict[cp >> 9].append(cp & 0x1FF) + + # We don't strictly need to keep track of characters that are always wide, + # because being in an emoji variation seq won't affect their width. + # So store their info only when it wouldn't inflate the size of the tables. + keys = list(prefixes_dict.keys()) + for k in keys: + if all(map(lambda cp: width_map[(k << 9) | cp] == EffectiveWidth.WIDE, prefixes_dict[k])): + del prefixes_dict[k] + + # Another assumption made by the data structure. + # Ensures 4 bits are enough to index into subtable + assert len(prefixes_dict.keys()) <= 15 + index_nibbles = [0xF] * 256 + for idx, k in enumerate(prefixes_dict.keys()): + index_nibbles[k] = idx + + index = [] + for tup in batched(index_nibbles, 2): + next = 0 + for i in range(0, 2): + next |= tup[i] << (4 * i) + index.append(next) + + leaves = [] + for leaf_idx, cps in enumerate(prefixes_dict.values()): + leaf = [0] * 64 + for cp in cps: + idx_in_leaf, bit_shift = divmod(cp, 8) + leaf[idx_in_leaf] |= 1 << bit_shift + leaves.append(leaf) + return (index, leaves) + + def emit_module( out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]", - emoji_variations: "list[int]", + variation_table: "tuple[list[int], list[list[int]]]", ): """Outputs a Rust module to `out_name` using table data from `tables`. If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`. 
@@ -490,16 +541,33 @@ def emit_module( """ ) + variation_idx, variation_leaves = variation_table + module.write( - """ + f""" /// Whether this character forms an [emoji presentation sequence] /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) - /// when followed by `'\\u{FEOF}'`. + /// when followed by `'\\u{{FEOF}}'`. /// Emoji presentation sequences are considered to have width 2. + /// This may spuriously return `false` for all characters that are always wide. #[inline] - pub fn starts_emoji_presentation_seq(c: char) -> bool { - EMOJI_PRESENTATION_RANGES.binary_search(&c).is_ok() - } + pub fn starts_emoji_presentation_seq(c: char) -> bool {{ + let cp: u32 = c.into(); + let Ok(top_byte): Result = ((cp) >> 9).try_into() else {{ + return false; + }}; + + let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; + let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; + if index_nibble >= {len(variation_leaves)} {{ + return false; + }} + + let leaf_byte = EMOJI_PRESENTATION_LEAVES[usize::from(index_nibble)] + [usize::try_from((cp >> 3) & 0x3F).unwrap()]; + + ((leaf_byte >> (cp & 7)) & 1) == 1 + }} """ ) @@ -556,15 +624,36 @@ def emit_module( module.write( f""" - /// Each tuple corresponds to a range (inclusive at both ends) - /// of characters that can start an emoji presentation sequence. - static EMOJI_PRESENTATION_RANGES: [char; {len(emoji_variations)}] = [ + /// An array of 256 4-bit nibbles. Index with bytes 9-16 (where LSB is 0) + /// of the char you want to test. 0xF means it's not part of a presentation seq, + /// anything else means index into the next table. + static EMOJI_PRESENTATION_INDEX: [u8; {len(variation_idx)}] = [ """ ) - for cp in emoji_variations: - module.write(f" '\\u{{{cp:X}}}',\n") + for row in batched(variation_idx, 15): + module.write(" ") + for idx in row: + module.write(f" 0x{idx:02X},") + module.write("\n") module.write(" ];\n") + module.write( + f""" + /// Array of 512-bit bitmaps. 
Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) + /// bitmap with the 9 LSB of your codepoint to get whether it can start an emoji presentation seq. + static EMOJI_PRESENTATION_LEAVES: [[u8; 64]; {len(variation_leaves)}] = [ +""" + ) + for leaf in variation_leaves: + module.write(" [\n") + for row in batched(leaf, 14): + module.write(" ") + for entry in row: + module.write(f" 0x{entry:02X},") + module.write("\n") + module.write(" ],\n") + + module.write(" ];\n") module.write("}\n") @@ -574,6 +663,7 @@ def main(module_filename: str): `module_filename`. We obey the following rules in decreasing order of importance: + - Emoji presentation sequences are double-width. - The soft hyphen (`U+00AD`) is single-width. (https://archive.is/fCT3c) - Hangul jamo medial vowels & final consonants are zero-width. - All `Default_Ignorable_Code_Point`s are zero-width, except for U+115F HANGUL CHOSEONG FILLER. @@ -600,18 +690,26 @@ def main(module_filename: str): width_map[0x00AD] = EffectiveWidth.NARROW tables = make_tables(TABLE_CFGS, enumerate(width_map)) - emoji_variations = load_variation_sequences(width_map) + + emoji_variations = load_variation_sequences() + variation_table = make_variation_sequence_table(emoji_variations, width_map) print("------------------------") total_size = 0 for i, table in enumerate(tables): size_bytes = len(table.to_bytes()) - print(f"Table {i} Size: {size_bytes} bytes") + print(f"Table {i} size: {size_bytes} bytes") total_size += size_bytes + emoji_index_size = len(variation_table[0]) + print(f"Emoji Presentation Index Size: {emoji_index_size} bytes") + total_size += emoji_index_size + emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0]) + print(f"Emoji Presentation Leaves Size: {emoji_leaves_size} bytes") + total_size += emoji_leaves_size print("------------------------") print(f" Total Size: {total_size} bytes") - emit_module(module_filename, version, tables, emoji_variations) + emit_module(module_filename, 
version, tables, variation_table) print(f'Wrote to "{module_filename}"') diff --git a/src/lib.rs b/src/lib.rs index 42e4fe4..0758d5f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -42,7 +42,8 @@ //! unicode-width = "0.1.5" //! ``` -#![deny(missing_docs, unsafe_code)] +#![forbid(unsafe_code)] +#![deny(missing_docs)] #![doc( html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" diff --git a/src/tables.rs b/src/tables.rs index e6cb8c7..0297983 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -63,9 +63,24 @@ pub mod charwidth { /// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) /// when followed by `'\u{FEOF}'`. /// Emoji presentation sequences are considered to have width 2. + /// This may spuriously return `false` for all characters that are always wide. #[inline] pub fn starts_emoji_presentation_seq(c: char) -> bool { - EMOJI_PRESENTATION_RANGES.binary_search(&c).is_ok() + let cp: u32 = c.into(); + let Ok(top_byte): Result = ((cp) >> 9).try_into() else { + return false; + }; + + let index_byte = EMOJI_PRESENTATION_INDEX[usize::from(top_byte >> 1)]; + let index_nibble = (index_byte >> (4 * (top_byte & 1))) & 0xF; + if index_nibble >= 11 { + return false; + } + + let leaf_byte = EMOJI_PRESENTATION_LEAVES[usize::from(index_nibble)] + [usize::try_from((cp >> 3) & 0x3F).unwrap()]; + + ((leaf_byte >> (cp & 7)) & 1) == 1 } /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or @@ -547,221 +562,100 @@ pub mod charwidth { 0xFF, 0xFF, 0x5F, ]; - /// Each tuple corresponds to a range (inclusive at both ends) - /// of characters that can start an emoji presentation sequence. 
- static EMOJI_PRESENTATION_RANGES: [char; 213] = [ - '\u{23}', - '\u{2A}', - '\u{30}', - '\u{31}', - '\u{32}', - '\u{33}', - '\u{34}', - '\u{35}', - '\u{36}', - '\u{37}', - '\u{38}', - '\u{39}', - '\u{A9}', - '\u{AE}', - '\u{203C}', - '\u{2049}', - '\u{2122}', - '\u{2139}', - '\u{2194}', - '\u{2195}', - '\u{2196}', - '\u{2197}', - '\u{2198}', - '\u{2199}', - '\u{21A9}', - '\u{21AA}', - '\u{2328}', - '\u{23CF}', - '\u{23ED}', - '\u{23EE}', - '\u{23EF}', - '\u{23F1}', - '\u{23F2}', - '\u{23F8}', - '\u{23F9}', - '\u{23FA}', - '\u{24C2}', - '\u{25AA}', - '\u{25AB}', - '\u{25B6}', - '\u{25C0}', - '\u{25FB}', - '\u{25FC}', - '\u{2600}', - '\u{2601}', - '\u{2602}', - '\u{2603}', - '\u{2604}', - '\u{260E}', - '\u{2611}', - '\u{2618}', - '\u{261D}', - '\u{2620}', - '\u{2622}', - '\u{2623}', - '\u{2626}', - '\u{262A}', - '\u{262E}', - '\u{262F}', - '\u{2638}', - '\u{2639}', - '\u{263A}', - '\u{2640}', - '\u{2642}', - '\u{265F}', - '\u{2660}', - '\u{2663}', - '\u{2665}', - '\u{2666}', - '\u{2668}', - '\u{267B}', - '\u{267E}', - '\u{2692}', - '\u{2694}', - '\u{2695}', - '\u{2696}', - '\u{2697}', - '\u{2699}', - '\u{269B}', - '\u{269C}', - '\u{26A0}', - '\u{26A7}', - '\u{26B0}', - '\u{26B1}', - '\u{26C8}', - '\u{26CF}', - '\u{26D1}', - '\u{26D3}', - '\u{26E9}', - '\u{26F0}', - '\u{26F1}', - '\u{26F4}', - '\u{26F7}', - '\u{26F8}', - '\u{26F9}', - '\u{2702}', - '\u{2708}', - '\u{2709}', - '\u{270C}', - '\u{270D}', - '\u{270F}', - '\u{2712}', - '\u{2714}', - '\u{2716}', - '\u{271D}', - '\u{2721}', - '\u{2733}', - '\u{2734}', - '\u{2744}', - '\u{2747}', - '\u{2763}', - '\u{2764}', - '\u{27A1}', - '\u{2934}', - '\u{2935}', - '\u{2B05}', - '\u{2B06}', - '\u{2B07}', - '\u{1F170}', - '\u{1F171}', - '\u{1F17E}', - '\u{1F17F}', - '\u{1F321}', - '\u{1F324}', - '\u{1F325}', - '\u{1F326}', - '\u{1F327}', - '\u{1F328}', - '\u{1F329}', - '\u{1F32A}', - '\u{1F32B}', - '\u{1F32C}', - '\u{1F336}', - '\u{1F37D}', - '\u{1F396}', - '\u{1F397}', - '\u{1F399}', - '\u{1F39A}', - '\u{1F39B}', - 
'\u{1F39E}', - '\u{1F39F}', - '\u{1F3CB}', - '\u{1F3CC}', - '\u{1F3CD}', - '\u{1F3CE}', - '\u{1F3D4}', - '\u{1F3D5}', - '\u{1F3D6}', - '\u{1F3D7}', - '\u{1F3D8}', - '\u{1F3D9}', - '\u{1F3DA}', - '\u{1F3DB}', - '\u{1F3DC}', - '\u{1F3DD}', - '\u{1F3DE}', - '\u{1F3DF}', - '\u{1F3F3}', - '\u{1F3F5}', - '\u{1F3F7}', - '\u{1F43F}', - '\u{1F441}', - '\u{1F4FD}', - '\u{1F549}', - '\u{1F54A}', - '\u{1F56F}', - '\u{1F570}', - '\u{1F573}', - '\u{1F574}', - '\u{1F575}', - '\u{1F576}', - '\u{1F577}', - '\u{1F578}', - '\u{1F579}', - '\u{1F587}', - '\u{1F58A}', - '\u{1F58B}', - '\u{1F58C}', - '\u{1F58D}', - '\u{1F590}', - '\u{1F5A5}', - '\u{1F5A8}', - '\u{1F5B1}', - '\u{1F5B2}', - '\u{1F5BC}', - '\u{1F5C2}', - '\u{1F5C3}', - '\u{1F5C4}', - '\u{1F5D1}', - '\u{1F5D2}', - '\u{1F5D3}', - '\u{1F5DC}', - '\u{1F5DD}', - '\u{1F5DE}', - '\u{1F5E1}', - '\u{1F5E3}', - '\u{1F5E8}', - '\u{1F5EF}', - '\u{1F5F3}', - '\u{1F5FA}', - '\u{1F6CB}', - '\u{1F6CD}', - '\u{1F6CE}', - '\u{1F6CF}', - '\u{1F6E0}', - '\u{1F6E1}', - '\u{1F6E2}', - '\u{1F6E3}', - '\u{1F6E4}', - '\u{1F6E5}', - '\u{1F6E9}', - '\u{1F6F0}', - '\u{1F6F3}', + /// An array of 256 4-bit nibbles. Index with bytes 9-16 (where LSB is 0) + /// of the char you want to test. 0xF means it's not part of a presentation seq, + /// anything else means index into the next table. 
+ static EMOJI_PRESENTATION_INDEX: [u8; 128] = [ + 0xF0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x21, 0x43, 0x65, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x87, 0xA9, 0xFF, 0xFF, + ]; + + /// Array of 512-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`) + /// bitmap with the 9 LSB of your codepoint to get whether it can start an emoji presentation seq. 
+ static EMOJI_PRESENTATION_LEAVES: [[u8; 64]; 11] = [ + [ + 0x00, 0x00, 0x00, 0x00, 0x08, 0x04, 0xFF, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x03, 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x80, 0x00, 0x00, 0x00, 0xE0, 0x06, 0x07, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x40, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, + ], + [ + 0x1F, 0x40, 0x02, 0x21, 0x4D, 0xC4, 0x00, 0x07, 0x05, 0x00, 0x00, 0x80, 0x69, 0x01, + 0x00, 0x48, 0x00, 0x00, 0xF4, 0x1A, 0x81, 0x00, 0x03, 0x00, 0x00, 0x81, 0x0A, 0x00, + 0x00, 0x02, 0x93, 0x03, 0x04, 0xB3, 0x54, 0x20, 0x02, 0x00, 0x18, 0x00, 0x90, 0x00, + 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 
0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF2, 0x1F, 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0xC0, 0xCE, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x78, 0xF0, 0xFF, 0x00, 0x00, 0xA8, 0x00, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, + 0x00, 0x00, 0x00, 0x80, 0xF9, 0x03, 0x80, 0x3C, 
0x01, 0x00, 0x20, 0x01, 0x06, 0x10, + 0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0x04, + ], + [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE8, 0x00, 0x00, + 0x3F, 0x02, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ], ]; } diff --git a/src/tests.rs b/src/tests.rs index 741b459..ff141a3 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -268,4 +268,11 @@ fn test_emoji_presentation() { assert_eq!(UnicodeWidthStr::width("a\u{0023}\u{FE0F}a"), 4); assert_eq!(UnicodeWidthStr::width("\u{0023}a\u{FE0F}"), 2); assert_eq!(UnicodeWidthStr::width("a\u{FE0F}"), 1); + assert_eq!(UnicodeWidthStr::width("\u{0023}\u{0023}\u{FE0F}a"), 4); + + assert_eq!(UnicodeWidthStr::width("\u{002A}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{23F9}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{24C2}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{1F6F3}\u{FE0F}"), 2); + assert_eq!(UnicodeWidthStr::width("\u{1F700}\u{FE0F}"), 1); }