Skip to content

Commit

Permalink
Make characters with Line_Break=Ambiguous ambiguous
Browse files Browse the repository at this point in the history
  • Loading branch information
Jules-Bertholet committed Jun 9, 2024
1 parent afab363 commit acaacf2
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 15 deletions.
12 changes: 12 additions & 0 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,19 @@
# - DerivedCoreProperties.txt
# - EastAsianWidth.txt
# - HangulSyllableType.txt
# - LineBreak.txt
# - NormalizationTest.txt (for tests only)
# - PropList.txt
# - ReadMe.txt
# - Scripts.txt
# - UnicodeData.txt
# - emoji/emoji-data.txt
# - emoji/emoji-test.txt (for tests only)
# - emoji/emoji-variation-sequences.txt
# - extracted/DerivedCombiningClass.txt
# - extracted/DerivedGeneralCategory.txt
# - extracted/DerivedJoiningGroup.txt
# - extracted/DerivedJoiningType.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated module into git.
Expand Down Expand Up @@ -429,6 +434,13 @@ def load_east_asian_widths() -> list[EastAsianWidth]:
# Catch any leftover codepoints and assign them implicit Neutral/narrow width.
width_map.append(EastAsianWidth.NARROW)

# Characters with Line_Break=AI (ambiguous line breaking) are treated as East Asian Ambiguous width
load_property(
"LineBreak.txt",
"AI",
lambda cp: (operator.setitem(width_map, cp, EastAsianWidth.AMBIGUOUS)),
)

# Characters from alphabetic scripts are narrow
load_property(
"Scripts.txt",
Expand Down
11 changes: 8 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,11 @@
//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
//! 6. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
//! - Fulfills one of the following conditions:
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
//! - Has a [`Line_Break`] of [`AI`], or
//! - Has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
//! - Is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387); and
//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and
//! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
//! 7. All other characters have width 1.
Expand All @@ -136,12 +138,15 @@
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
//! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862
//! [`Joining_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G50009
//! [`Line_Break`]: https://www.unicode.org/reports/tr14/#LD5
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908
//! [`Script`]: https://www.unicode.org/reports/tr24/#Script
//!
//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2
//! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4
//! [`Ambiguous`]: https://www.unicode.org/reports/tr11/#ED6
//!
//! [`AI`]: https://www.unicode.org/reports/tr14/#AI
//!
//! [combining marks]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G30602
//!
Expand Down
24 changes: 12 additions & 12 deletions src/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1030,8 +1030,8 @@ static WIDTH_MIDDLE: Align64<[[u8; 64]; WIDTH_MIDDLE_LEN]> = Align64([
],
#[cfg(feature = "cjk")]
[
0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0x2E, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE,
0x33, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAF, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38,
0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0x2E, 0xA8, 0x39, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD,
0xAE, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAF, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38,
0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0xB0, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
0x39, 0x39, 0x39, 0x39,
Expand Down Expand Up @@ -1884,7 +1884,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
#[cfg(feature = "cjk")]
[
0x95, 0x59, 0x59, 0x55, 0x95, 0x65, 0x55, 0x55, 0x69, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55, 0x55, 0x55, 0x55, 0x95, 0x56, 0x95, 0x6A, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA,
0x55, 0x55, 0x55, 0x55, 0x55, 0x95, 0x5A, 0x95, 0x6A, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA,
0x5A, 0x55,
],
#[cfg(feature = "cjk")]
Expand Down Expand Up @@ -1920,13 +1920,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
#[cfg(feature = "cjk")]
[
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x9A, 0xAA, 0xAA, 0xAA,
0xAA, 0xAA,
],
#[cfg(feature = "cjk")]
[
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x55,
0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x56,
0x55, 0x55,
],
#[cfg(feature = "cjk")]
Expand All @@ -1937,7 +1931,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
],
#[cfg(feature = "cjk")]
[
0x55, 0x69, 0x59, 0xA5, 0x55, 0x5F, 0x55, 0x66, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x69, 0x59, 0xA5, 0x55, 0xAF, 0x55, 0x66, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x66, 0x55, 0xFF, 0xFF, 0xFF, 0x55, 0x55, 0x55, 0x9A, 0x9A, 0x6A, 0x9A, 0x55, 0x55,
0x55, 0xD5,
],
Expand All @@ -1954,6 +1948,12 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
0xAA, 0xAA,
],
#[cfg(feature = "cjk")]
[
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xFD, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0x55, 0x55,
0xD5, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55,
],
#[cfg(feature = "cjk")]
[
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xD5, 0x57, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0xAD, 0x5A, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
Expand All @@ -1979,7 +1979,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
],
#[cfg(feature = "cjk")]
[
0xAA, 0xAA, 0x6A, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0xAA, 0xAA, 0xAA,
0xAA, 0xAA, 0xAA, 0x56, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0xAA, 0xAA, 0xAA,
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0x55, 0xAA, 0xAA,
0xAA, 0xAA,
],
Expand Down
7 changes: 7 additions & 0 deletions tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,13 @@ fn emoji_test_file() {
}
}

// Checks the width reported for three sample code points, U+24EA, U+2616
// and U+2780, which this test (per its name) treats as having ambiguous
// line-breaking behavior.
//
// NOTE(review): `assert_width!` is defined elsewhere in this file; the
// arguments `1, 2` are presumably the expected width in the default
// (non-CJK) context followed by the expected width in the CJK context —
// confirm against the macro definition.
#[test]
fn ambiguous_line_break() {
// U+24EA — presumably CIRCLED DIGIT ZERO; verify against UnicodeData.txt.
assert_width!("\u{24EA}", 1, 2);
// U+2616 — presumably WHITE SHOGI PIECE; verify against UnicodeData.txt.
assert_width!("\u{2616}", 1, 2);
// U+2780 — presumably DINGBAT CIRCLED SANS-SERIF DIGIT ONE; verify against UnicodeData.txt.
assert_width!("\u{2780}", 1, 2);
}

// Test traits are unsealed

#[cfg(feature = "cjk")]
Expand Down

0 comments on commit acaacf2

Please sign in to comment.