Don't handle noncharacters differently than other unassigned codepoints

unicode-rs · Mar 13, 2024 · 4347629 · 4347629
1 parent 0b13808
commit 4347629
Show file tree

Hide file tree

Showing 4 changed files with 4 additions and 31 deletions.
diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -106,7 +106,7 @@ def _load_unicode_data(self):
 
         # Characters that cannot be part of a combining character sequence:
         # control characters, format characters other than ZWJ and ZWNJ,
-        # the line and paragraph separators, and noncharacters.
+        # and the line and paragraph separators.
         self.not_in_ccs = []
 
         assigned_start = 0;
@@ -147,14 +147,6 @@ def _load_unicode_data(self):
 
         self.general_category_public_assigned.append((assigned_start, prev_char_int))
 
-        # Mark noncharacters as nongraphic
-        for i in range(0xFDD0, 0xFDF0):
-            self.not_in_ccs.append(i)
-        for prefix in range(0, 0x11):
-            shifted = prefix << 16
-            self.not_in_ccs.append(shifted | 0xFFFE)
-            self.not_in_ccs.append(shifted | 0xFFFF)
-
         self.not_in_ccs.sort()
 
     def _load_default_ignorable_marks(self):

diff --git a/src/correct_ccs.rs b/src/correct_ccs.rs
@@ -41,8 +41,7 @@ impl CcsKind {
 /// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
 /// by inserting U+00A0 NO-BREAK SPACE in front of them.
 ///
-/// For the purposes of this iterator, private use characters,
-/// as well as unassigned codepoints other than noncharacters,
+/// For the purposes of this iterator, private use characters and unassigned codepoints
 /// are considered valid base characters,
 /// so combining character sequences that follow such will not be modified.
 ///

diff --git a/src/lib.rs b/src/lib.rs
@@ -139,7 +139,7 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
     /// with the correct advance width,
     /// in diverse contexts (for example, when printed to a terminal).
     ///
-    /// Sequences following a private use character or an unassigned codepoint that is not a noncharacter
+    /// Sequences following a private use character or an unassigned codepoint
     /// are not corrected. Additionally, combining character sequences consisting entirely of
     /// [default-ignorable code points](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I8.1.40715)
     /// are also left untouched. Handling this last case may require the iterator

diff --git a/src/tables.rs b/src/tables.rs
@@ -20973,33 +20973,15 @@ pub fn not_in_ccs(c: char) -> bool {
         | '\u{2028}'..='\u{202E}'
         | '\u{2060}'..='\u{2064}'
         | '\u{2066}'..='\u{206F}'
-        | '\u{FDD0}'..='\u{FDEF}'
         | '\u{FEFF}'
         | '\u{FFF9}'..='\u{FFFB}'
-        | '\u{FFFE}'..='\u{FFFF}'
         | '\u{110BD}'
         | '\u{110CD}'
         | '\u{13430}'..='\u{1343F}'
         | '\u{1BCA0}'..='\u{1BCA3}'
         | '\u{1D173}'..='\u{1D17A}'
-        | '\u{1FFFE}'..='\u{1FFFF}'
-        | '\u{2FFFE}'..='\u{2FFFF}'
-        | '\u{3FFFE}'..='\u{3FFFF}'
-        | '\u{4FFFE}'..='\u{4FFFF}'
-        | '\u{5FFFE}'..='\u{5FFFF}'
-        | '\u{6FFFE}'..='\u{6FFFF}'
-        | '\u{7FFFE}'..='\u{7FFFF}'
-        | '\u{8FFFE}'..='\u{8FFFF}'
-        | '\u{9FFFE}'..='\u{9FFFF}'
-        | '\u{AFFFE}'..='\u{AFFFF}'
-        | '\u{BFFFE}'..='\u{BFFFF}'
-        | '\u{CFFFE}'..='\u{CFFFF}'
-        | '\u{DFFFE}'..='\u{DFFFF}'
         | '\u{E0001}'
-        | '\u{E0020}'..='\u{E007F}'
-        | '\u{EFFFE}'..='\u{EFFFF}'
-        | '\u{FFFFE}'..='\u{FFFFF}'
-        | '\u{10FFFE}'..='\u{10FFFF}' => true,
+        | '\u{E0020}'..='\u{E007F}' => true,
         _ => false,
     }
 }