Skip to content

Commit

Permalink
Merge pull request unicode-rs#106 from ShE3py/iter-ctors
Browse files Browse the repository at this point in the history
Expose all iterator constructors, add hyperlinks to Unicode glossary/technical reports
  • Loading branch information
Manishearth authored Oct 14, 2024
2 parents c992130 + 5b31ba1 commit 9d5d794
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 18 deletions.
2 changes: 1 addition & 1 deletion src/decompose.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ impl<I: Iterator<Item = char>> Decompositions<I> {

/// Create a new decomposition iterator for compatibility decompositions (NFKD)
///
/// Note that this iterator can also be obtained by directly calling [`.nfd()`](crate::UnicodeNormalization::nfd)
/// Note that this iterator can also be obtained by directly calling [`.nfkd()`](crate::UnicodeNormalization::nfkd)
/// on the iterator.
#[inline]
pub fn new_compatible(iter: I) -> Decompositions<I> {
Expand Down
17 changes: 11 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
/// (compatibility decomposition followed by canonical composition).
fn nfkc(self) -> Recompositions<I>;

/// A transformation which replaces CJK Compatibility Ideograph codepoints
/// with normal forms using Standardized Variation Sequences. This is not
/// A transformation which replaces [CJK Compatibility Ideograph] codepoints
/// with normal forms using [Standardized Variation Sequences]. This is not
/// part of the canonical or compatibility decomposition algorithms, but
/// performing it before those algorithms produces normalized output which
/// better preserves the intent of the original text.
Expand All @@ -123,10 +123,15 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
/// may not immediately help text display as intended, but they at
/// least preserve the information in a standardized form, giving
/// implementations the option to recognize them.
///
/// [CJK Compatibility Ideograph]: https://www.unicode.org/glossary/#compatibility_ideograph
/// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence
fn cjk_compat_variants(self) -> Replacements<I>;

/// An Iterator over the string with Conjoining Grapheme Joiner characters
/// inserted according to the Stream-Safe Text Process (UAX15-D4)
/// inserted according to the Stream-Safe Text Process ([UAX15-D4]).
///
/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
fn stream_safe(self) -> StreamSafe<I>;
}

Expand All @@ -153,7 +158,7 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {

#[inline]
fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
replace::new_cjk_compat_variants(self.chars())
Replacements::new_cjk_compat_variants(self.chars())
}

#[inline]
Expand Down Expand Up @@ -185,7 +190,7 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {

#[inline]
fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> {
replace::new_cjk_compat_variants(Some(self).into_iter())
Replacements::new_cjk_compat_variants(Some(self).into_iter())
}

#[inline]
Expand Down Expand Up @@ -217,7 +222,7 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {

#[inline]
fn cjk_compat_variants(self) -> Replacements<I> {
replace::new_cjk_compat_variants(self)
Replacements::new_cjk_compat_variants(self)
}

#[inline]
Expand Down
11 changes: 6 additions & 5 deletions src/normalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,12 @@ pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
///
/// [Standardized Variation Sequences] are used instead of the standard canonical
/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
/// to avoid losing information. See the
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
/// "Other Enhancements" section of the
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
/// for more information.
/// to avoid losing information. See the [Unicode Variation Sequence FAQ] and the
/// "Other Enhancements" section of the [Unicode 6.3 Release Summary] for more information.
///
/// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence
/// [Unicode Variation Sequence FAQ]: http://unicode.org/faq/vs.html
/// [Unicode 6.3 Release Summary]: https://www.unicode.org/versions/Unicode6.3.0/#Summary
#[inline]
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
where
Expand Down
15 changes: 12 additions & 3 deletions src/replace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,18 @@ pub struct Replacements<I> {
buffer: Option<char>,
}

#[inline]
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
Replacements { iter, buffer: None }
impl<I: Iterator<Item = char>> Replacements<I> {
/// Create a new iterator that replaces [CJK Compatibility Ideograph] codepoints with normal forms using [Standardized Variation Sequences].
///
/// Note that this iterator can also be obtained by directly calling [`.cjk_compat_variants()`] on the iterator.
///
/// [CJK Compatibility Ideograph]: https://www.unicode.org/glossary/#compatibility_ideograph
/// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence
/// [`.cjk_compat_variants()`]: crate::UnicodeNormalization::cjk_compat_variants
#[inline]
pub fn new_cjk_compat_variants(iter: I) -> Replacements<I> {
Replacements { iter, buffer: None }
}
}

impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
Expand Down
13 changes: 10 additions & 3 deletions src/stream_safe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,24 @@ use crate::tables::stream_safe_leading_nonstarters;
pub(crate) const MAX_NONSTARTERS: usize = 30;
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';

/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// [UAX15-D4]: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
///
/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
pub struct StreamSafe<I> {
iter: I,
nonstarter_count: usize,
buffer: Option<char>,
}

impl<I> StreamSafe<I> {
pub(crate) fn new(iter: I) -> Self {
impl<I: Iterator<Item = char>> StreamSafe<I> {
/// Create a new stream-safe iterator.
///
/// Note that this iterator can also be obtained by directly calling [`.stream_safe()`](crate::UnicodeNormalization::stream_safe)
/// on the iterator.
#[inline]
pub fn new(iter: I) -> Self {
Self {
iter,
nonstarter_count: 0,
Expand Down

0 comments on commit 9d5d794

Please sign in to comment.