Skip to content

Commit

Permalink
Treat Default_Ignorable_Code_Points as zero-width
Browse files Browse the repository at this point in the history
  • Loading branch information
Jules-Bertholet committed Feb 10, 2024
1 parent 8942487 commit aed33e9
Show file tree
Hide file tree
Showing 4 changed files with 246 additions and 200 deletions.
32 changes: 28 additions & 4 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,14 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
def load_zero_widths() -> "list[bool]":
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
character. `c` is considered a zero-width character if `c` is in general categories
`Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
`Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`),
or if it has the `Default_Ignorable_Code_Point` property (determined by fetching
and processing `DerivedCoreProperties.txt`)."""

zw_cat_codes = ["Cc", "Cf", "Mn", "Me"]
zw_map = []

with fetch_open("UnicodeData.txt") as categories:
zw_map = []
current = 0
for line in categories.readlines():
if len(raw_data := line.split(";")) != 15:
Expand All @@ -159,7 +164,7 @@ def load_zero_widths() -> "list[bool]":
raw_data[1],
raw_data[2],
]
zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]
zero_width = cat_code in zw_cat_codes

assert current <= codepoint
while current <= codepoint:
Expand All @@ -176,7 +181,26 @@ def load_zero_widths() -> "list[bool]":
# Catch any leftover codepoints. They must be unassigned (so nonzero width)
zw_map.append(False)

return zw_map
with fetch_open("DerivedCoreProperties.txt") as properties:
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point +# (\w+)")
multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point +# (\w+)")

for line in properties.readlines():
raw_data = None # (low, high, category)
if match := single.match(line):
raw_data = (match.group(1), match.group(1), match.group(2))
elif match := multiple.match(line):
raw_data = (match.group(1), match.group(2), match.group(3))
else:
continue
low = int(raw_data[0], 16)
high = int(raw_data[1], 16)
cat = raw_data[2]
if cat not in zw_cat_codes:
for cp in range(low, high + 1):
zw_map[cp] = True

return zw_map


class Bucket:
Expand Down
19 changes: 12 additions & 7 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,10 @@
//! ```
#![deny(missing_docs, unsafe_code)]
#![doc(html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png")]

#![doc(
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![cfg_attr(feature = "bench", feature(test))]
#![no_std]

Expand Down Expand Up @@ -87,10 +88,14 @@ pub trait UnicodeWidthChar {

impl UnicodeWidthChar for char {
#[inline]
fn width(self) -> Option<usize> { cw::width(self, false) }
fn width(self) -> Option<usize> {
cw::width(self, false)
}

#[inline]
fn width_cjk(self) -> Option<usize> { cw::width(self, true) }
fn width_cjk(self) -> Option<usize> {
cw::width(self, true)
}
}

/// Methods for determining displayed width of Unicode strings.
Expand All @@ -103,7 +108,7 @@ pub trait UnicodeWidthStr {
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 1 column wide. This is consistent with the recommendations for
/// non-CJK contexts, or when the context cannot be reliably determined.
fn width<'a>(&'a self) -> usize;
fn width(&self) -> usize;

/// Returns the string's displayed width in columns.
///
Expand All @@ -113,7 +118,7 @@ pub trait UnicodeWidthStr {
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 2 columns wide. This is consistent with the recommendations for
/// CJK contexts.
fn width_cjk<'a>(&'a self) -> usize;
fn width_cjk(&self) -> usize;
}

impl UnicodeWidthStr for str {
Expand Down
Loading

0 comments on commit aed33e9

Please sign in to comment.