Skip to content

Commit

Permalink
Don't handle noncharacters differently than other unassigned codepoints
Browse files: browse the repository at this point in the history
  • Loading branch information
Jules-Bertholet committed Mar 13, 2024
1 parent 0b13808 commit 4347629
Show file tree
Hide file tree
Showing 4 changed files with 4 additions and 31 deletions.
10 changes: 1 addition & 9 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def _load_unicode_data(self):

# Characters that cannot be part of a combining character sequence:
# control characters, format characters other than ZWJ and ZWNJ,
# the line and paragraph separators, and noncharacters.
# and the line and paragraph separators.
self.not_in_ccs = []

assigned_start = 0;
Expand Down Expand Up @@ -147,14 +147,6 @@ def _load_unicode_data(self):

self.general_category_public_assigned.append((assigned_start, prev_char_int))

# Mark noncharacters as nongraphic
for i in range(0xFDD0, 0xFDF0):
self.not_in_ccs.append(i)
for prefix in range(0, 0x11):
shifted = prefix << 16
self.not_in_ccs.append(shifted | 0xFFFE)
self.not_in_ccs.append(shifted | 0xFFFF)

self.not_in_ccs.sort()

def _load_default_ignorable_marks(self):
Expand Down
3 changes: 1 addition & 2 deletions src/correct_ccs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@ impl CcsKind {
/// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
/// by inserting U+00A0 NO-BREAK SPACE in front of them.
///
/// For the purposes of this iterator, private use characters,
/// as well as unassigned codepoints other than noncharacters,
/// For the purposes of this iterator, private use characters and unassigned codepoints
/// are considered valid base characters,
/// so combining character sequences that follow such will not be modified.
///
Expand Down
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
/// with the correct advance width,
/// in diverse contexts (for example, when printed to a terminal).
///
/// Sequences following a private use character or an unassigned codepoint that is not a noncharacter
/// Sequences following a private use character or an unassigned codepoint
/// are not corrected. Additionally, combining character sequences consisting entirely of
/// [default-ignorable code points](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I8.1.40715)
/// are also left untouched. Handling this last case may require the iterator
Expand Down
20 changes: 1 addition & 19 deletions src/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20973,33 +20973,15 @@ pub fn not_in_ccs(c: char) -> bool {
| '\u{2028}'..='\u{202E}'
| '\u{2060}'..='\u{2064}'
| '\u{2066}'..='\u{206F}'
| '\u{FDD0}'..='\u{FDEF}'
| '\u{FEFF}'
| '\u{FFF9}'..='\u{FFFB}'
| '\u{FFFE}'..='\u{FFFF}'
| '\u{110BD}'
| '\u{110CD}'
| '\u{13430}'..='\u{1343F}'
| '\u{1BCA0}'..='\u{1BCA3}'
| '\u{1D173}'..='\u{1D17A}'
| '\u{1FFFE}'..='\u{1FFFF}'
| '\u{2FFFE}'..='\u{2FFFF}'
| '\u{3FFFE}'..='\u{3FFFF}'
| '\u{4FFFE}'..='\u{4FFFF}'
| '\u{5FFFE}'..='\u{5FFFF}'
| '\u{6FFFE}'..='\u{6FFFF}'
| '\u{7FFFE}'..='\u{7FFFF}'
| '\u{8FFFE}'..='\u{8FFFF}'
| '\u{9FFFE}'..='\u{9FFFF}'
| '\u{AFFFE}'..='\u{AFFFF}'
| '\u{BFFFE}'..='\u{BFFFF}'
| '\u{CFFFE}'..='\u{CFFFF}'
| '\u{DFFFE}'..='\u{DFFFF}'
| '\u{E0001}'
| '\u{E0020}'..='\u{E007F}'
| '\u{EFFFE}'..='\u{EFFFF}'
| '\u{FFFFE}'..='\u{FFFFF}'
| '\u{10FFFE}'..='\u{10FFFF}' => true,
| '\u{E0020}'..='\u{E007F}' => true,
_ => false,
}
}
Expand Down

0 comments on commit 4347629

Please sign in to comment.