From 5b31ba1aa877da5cace5af5b50970f89903972cf Mon Sep 17 00:00:00 2001 From: Lieselotte <52315535+she3py@users.noreply.github.com> Date: Sun, 13 Oct 2024 00:57:38 +0200 Subject: [PATCH] Expose all iterator constructors, add hyperlinks to Unicode glossary/technical reports --- src/decompose.rs | 2 +- src/lib.rs | 17 +++++++++++------ src/normalize.rs | 11 ++++++----- src/replace.rs | 15 ++++++++++++--- src/stream_safe.rs | 13 ++++++++++--- 5 files changed, 40 insertions(+), 18 deletions(-) diff --git a/src/decompose.rs b/src/decompose.rs index 2c73383..b11b1f5 100644 --- a/src/decompose.rs +++ b/src/decompose.rs @@ -53,7 +53,7 @@ impl> Decompositions { /// Create a new decomposition iterator for compatability decompositions (NFkD) /// - /// Note that this iterator can also be obtained by directly calling [`.nfd()`](crate::UnicodeNormalization::nfd) + /// Note that this iterator can also be obtained by directly calling [`.nfkd()`](crate::UnicodeNormalization::nfkd) /// on the iterator. #[inline] pub fn new_compatible(iter: I) -> Decompositions { diff --git a/src/lib.rs b/src/lib.rs index 6c5e029..963d41a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -113,8 +113,8 @@ pub trait UnicodeNormalization> { /// (compatibility decomposition followed by canonical composition). fn nfkc(self) -> Recompositions; - /// A transformation which replaces CJK Compatibility Ideograph codepoints - /// with normal forms using Standardized Variation Sequences. This is not + /// A transformation which replaces [CJK Compatibility Ideograph] codepoints + /// with normal forms using [Standardized Variation Sequences]. This is not /// part of the canonical or compatibility decomposition algorithms, but /// performing it before those algorithms produces normalized output which /// better preserves the intent of the original text. @@ -123,10 +123,15 @@ pub trait UnicodeNormalization> { /// may not immediately help text display as intended, but they at /// least preserve the information in a standardized form, giving /// implementations the option to recognize them. + /// + /// [CJK Compatibility Ideograph]: https://www.unicode.org/glossary/#compatibility_ideograph + /// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence fn cjk_compat_variants(self) -> Replacements; /// An Iterator over the string with Conjoining Grapheme Joiner characters - /// inserted according to the Stream-Safe Text Process (UAX15-D4) + /// inserted according to the Stream-Safe Text Process ([UAX15-D4]). + /// + /// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4 fn stream_safe(self) -> StreamSafe; } @@ -153,7 +158,7 @@ impl<'a> UnicodeNormalization> for &'a str { #[inline] fn cjk_compat_variants(self) -> Replacements> { - replace::new_cjk_compat_variants(self.chars()) + Replacements::new_cjk_compat_variants(self.chars()) } #[inline] @@ -185,7 +190,7 @@ impl UnicodeNormalization> for char { #[inline] fn cjk_compat_variants(self) -> Replacements> { - replace::new_cjk_compat_variants(Some(self).into_iter()) + Replacements::new_cjk_compat_variants(Some(self).into_iter()) } #[inline] @@ -217,7 +222,7 @@ impl> UnicodeNormalization for I { #[inline] fn cjk_compat_variants(self) -> Replacements { - replace::new_cjk_compat_variants(self) + Replacements::new_cjk_compat_variants(self) } #[inline] diff --git a/src/normalize.rs b/src/normalize.rs index e59b667..96277f1 100644 --- a/src/normalize.rs +++ b/src/normalize.rs @@ -41,11 +41,12 @@ pub fn decompose_compatible(c: char, emit_char: F) { /// /// [Standardized Variation Sequences] are used instead of the standard canonical /// decompositions, notably for CJK codepoints with singleton canonical decompositions, -/// to avoid losing information. See the -/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the -/// "Other Enhancements" section of the -/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary) -/// for more information. +/// to avoid losing information. See the [Unicode Variation Sequence FAQ] and the +/// "Other Enhancements" section of the [Unicode 6.3 Release Summary] for more information. +/// +/// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence +/// [Unicode Variation Sequence FAQ]: http://unicode.org/faq/vs.html +/// [Unicode 6.3 Release Summary]: https://www.unicode.org/versions/Unicode6.3.0/#Summary #[inline] pub fn decompose_cjk_compat_variants(c: char, mut emit_char: F) where diff --git a/src/replace.rs b/src/replace.rs index 3ab2a57..b21a669 100644 --- a/src/replace.rs +++ b/src/replace.rs @@ -22,9 +22,18 @@ pub struct Replacements { buffer: Option, } -#[inline] -pub fn new_cjk_compat_variants>(iter: I) -> Replacements { - Replacements { iter, buffer: None } +impl> Replacements { + /// Create a new iterator that replaces [CJK Compatibility Ideograph] codepoints with normal forms using [Standardized Variation Sequences]. + /// + /// Note that this iterator can also be obtained by directly calling [`.cjk_compat_variants()`] on the iterator. + /// + /// [CJK Compatibility Ideograph]: https://www.unicode.org/glossary/#compatibility_ideograph + /// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence + /// [`.cjk_compat_variants()`]: crate::UnicodeNormalization::cjk_compat_variants + #[inline] + pub fn new_cjk_compat_variants(iter: I) -> Replacements { + Replacements { iter, buffer: None } + } } impl> Iterator for Replacements { diff --git a/src/stream_safe.rs b/src/stream_safe.rs index 86498d6..5e77bee 100644 --- a/src/stream_safe.rs +++ b/src/stream_safe.rs @@ -10,17 +10,24 @@ use crate::tables::stream_safe_leading_nonstarters; pub(crate) const MAX_NONSTARTERS: usize = 30; const COMBINING_GRAPHEME_JOINER: char = '\u{034F}'; -/// UAX15-D4: This iterator keeps track of how many non-starters there have been +/// [UAX15-D4]: This iterator keeps track of how many non-starters there have been /// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner /// (U+034F) if the count exceeds 30. +/// +/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4 pub struct StreamSafe { iter: I, nonstarter_count: usize, buffer: Option, } -impl StreamSafe { - pub(crate) fn new(iter: I) -> Self { +impl> StreamSafe { + /// Create a new stream safe iterator. + /// + /// Note that this iterator can also be obtained by directly calling [`.stream_safe()`](crate::UnicodeNormalization::stream_safe) + /// on the iterator. + #[inline] + pub fn new(iter: I) -> Self { Self { iter, nonstarter_count: 0,