Skip to content

Commit

Permalink
Treat emoji presentation sequences as fullwidth
Browse files Browse the repository at this point in the history
  • Loading branch information
Jules-Bertholet committed Feb 13, 2024
1 parent fda272b commit d1afcac
Show file tree
Hide file tree
Showing 4 changed files with 319 additions and 7 deletions.
72 changes: 67 additions & 5 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,13 @@ def fetch_open(filename: str):
"""Opens `filename` and return its corresponding file object. If `filename` isn't on disk,
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
"""
basename = os.path.basename(filename)
if not os.path.exists(os.path.basename(filename)):
os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
try:
return open(filename, encoding="utf-8")
return open(basename, encoding="utf-8")
except OSError:
sys.stderr.write(f"cannot load {filename}")
sys.stderr.write(f"cannot load {basename}")
sys.exit(1)


Expand Down Expand Up @@ -151,7 +152,7 @@ def load_zero_widths() -> "list[bool]":
character. `c` is considered a zero-width character if
- it is in general categories `Cc`, `Mn`, or `Me` (determined from `UnicodeData.txt`),
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`) and is not U+115F,
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
"""

Expand Down Expand Up @@ -388,8 +389,29 @@ def make_tables(
return tables


def variation_sequences() -> "list[tuple[int, int]]":
    """Outputs a list of character ranges, corresponding to all the valid characters for starting
    an emoji presentation sequence.

    The code points are read from `emoji/emoji-variation-sequences.txt` (obtained through
    `fetch_open`); only entries pairing a code point with `FE0F` and marked `emoji style`
    are kept.

    Returns inclusive `(lo, hi)` tuples of code points. The list is sorted in ascending order
    and adjacent code points are merged, so the ranges never overlap or touch. The generated
    Rust lookup performs a binary search over this table, which relies on that ordering.
    """

    # Captures the leading hex code point of a line whose second field is FE0F and whose
    # third field begins with "emoji style".
    sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")

    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
        # Iterate the file lazily. A set drops duplicates, and sorting below removes any
        # dependence on the order of lines in the data file.
        code_points = {
            int(match.group(1), 16)
            for line in sequences
            if (match := sequence.match(line))
        }

    ranges = []
    for cp in sorted(code_points):
        if ranges and ranges[-1][1] == cp - 1:
            # `cp` directly follows the previous range: extend it.
            ranges[-1] = (ranges[-1][0], cp)
        else:
            ranges.append((cp, cp))

    return ranges


def emit_module(
out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
out_name: str,
unicode_version: "tuple[int, int, int]",
tables: "list[Table]",
emoji_variations: "list[tuple[int, int]]",
):
"""Outputs a Rust module to `out_name` using table data from `tables`.
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
Expand Down Expand Up @@ -468,6 +490,31 @@ def emit_module(
"""
)

module.write(
"""
/// Whether this character forms an
/// [emoji presentation sequence](https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// when followed by `'\\u{FE0F}'`.
/// Emoji presentation sequences are considered to have width 2.
#[inline]
pub fn starts_emoji_presentation_seq(c: char) -> bool {
use core::cmp::Ordering::{Equal, Greater, Less};
EMOJI_PRESENTATION_RANGES
.binary_search_by(|&(lo, hi)| {
if lo > c {
Greater
} else if hi < c {
Less
} else {
Equal
}
})
.is_ok()
}
"""
)

module.write(
"""
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
Expand Down Expand Up @@ -516,6 +563,20 @@ def emit_module(
module.write(f" 0x{byte:02X},")
module.write("\n ];\n")
subtable_count = new_subtable_count

# emoji table

module.write(
f"""
/// Each tuple corresponds to a range (inclusive at both ends)
/// of characters that can start an emoji presentation sequence.
static EMOJI_PRESENTATION_RANGES: [(char, char); {len(emoji_variations)}] = [
"""
)
for lo, hi in emoji_variations:
module.write(f" ('\\u{{{lo:X}}}', '\\u{{{hi:X}}}'),\n")
module.write(" ];\n")

module.write("}\n")


Expand Down Expand Up @@ -551,6 +612,7 @@ def main(module_filename: str):
width_map[0x00AD] = EffectiveWidth.NARROW

tables = make_tables(TABLE_CFGS, enumerate(width_map))
emoji_variations = variation_sequences()

print("------------------------")
total_size = 0
Expand All @@ -561,7 +623,7 @@ def main(module_filename: str):
print("------------------------")
print(f" Total Size: {total_size} bytes")

emit_module(module_filename, version, tables)
emit_module(module_filename, version, tables, emoji_variations)
print(f'Wrote to "{module_filename}"')


Expand Down
31 changes: 29 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ pub trait UnicodeWidthStr {
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 1 column wide. This is consistent with the recommendations for
/// non-CJK contexts, or when the context cannot be reliably determined.
///
/// Also consistent with UAX11, this function treats
/// [emoji presentation sequences](https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// as 2 columns wide. This means that the width of a string may not equal
/// the sum of the widths of its individual characters.
fn width(&self) -> usize;

/// Returns the string's displayed width in columns.
Expand All @@ -118,17 +123,39 @@ pub trait UnicodeWidthStr {
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 2 column wide. This is consistent with the recommendations for
/// CJK contexts.
///
/// Also consistent with UAX11, this function treats
/// [emoji presentation sequences](https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// as 2 columns wide. This means that the width of a string may not equal
/// the sum of the widths of its individual characters.
fn width_cjk(&self) -> usize;
}

impl UnicodeWidthStr for str {
#[inline]
fn width(&self) -> usize {
self.chars().map(|c| cw::width(c, false).unwrap_or(0)).sum()
str_width(self, false)
}

#[inline]
fn width_cjk(&self) -> usize {
self.chars().map(|c| cw::width(c, true).unwrap_or(0)).sum()
str_width(self, true)
}
}

/// Computes the displayed width of `s` in columns.
///
/// Characters are visited from the end of the string towards the start so that, when a
/// character is examined, it is already known whether it is followed by U+FE0F
/// (possibly several in a row). A character for which `cw::starts_emoji_presentation_seq`
/// returns `true` and which is followed by U+FE0F counts as 2 columns; every other
/// character counts as `cw::width(c, is_cjk)`, with `None` treated as 0. U+FE0F itself
/// never adds to the total.
fn str_width(s: &str, is_cjk: bool) -> usize {
    let mut total = 0;
    // Set while the most recently visited character(s), i.e. the ones that come right
    // after the current character in string order, were U+FE0F.
    let mut followed_by_fe0f = false;

    for ch in s.chars().rev() {
        if ch == '\u{FE0F}' {
            followed_by_fe0f = true;
            continue;
        }

        let columns = if followed_by_fe0f && cw::starts_emoji_presentation_seq(ch) {
            2
        } else {
            match cw::width(ch, is_cjk) {
                Some(w) => w,
                None => 0,
            }
        };

        total += columns;
        followed_by_fe0f = false;
    }

    total
}
Loading

0 comments on commit d1afcac

Please sign in to comment.