Skip to content

Commit

Permalink
Add API to transform into KS X 1026-1 standard Korean syllables
Browse files Browse the repository at this point in the history
Gated behind the `ks_x_1026-1` feature.
  • Loading branch information
Jules-Bertholet committed Mar 1, 2024
1 parent c24ac7f commit 887d390
Show file tree
Hide file tree
Showing 3 changed files with 262 additions and 60 deletions.
49 changes: 40 additions & 9 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,10 @@ pub use crate::quick_check::{
};
pub use crate::recompose::Recompositions;
pub use crate::replace::Replacements;
pub use crate::standardize_korean_syllables::StandardKoreanSyllables;
pub use crate::standardize_korean_syllables::StandardizeKoreanSyllables;
#[cfg(feature = "ks_x_1026-1")]
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
pub use crate::standardize_korean_syllables::StandardizeKoreanSyllablesKsX1026_1;
pub use crate::stream_safe::StreamSafe;
pub use crate::tables::UNICODE_VERSION;
use core::{option, str::Chars};
Expand Down Expand Up @@ -148,9 +151,9 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
/// inserted according to the Stream-Safe Text Process ([UAX15-D4](https://unicode.org/reports/tr15/#UAX15-D4))
fn stream_safe(self) -> StreamSafe<I>;

/// An iterator over the string with Hangul choseong and jugseong filler characters inserted
/// An iterator over the string with Hangul choseong and jungseong filler characters inserted
/// to ensure that all Korean syllable blocks are in standard form according to [UAX29](https://www.unicode.org/reports/tr29/#Transforming_Into_SKS).
fn standard_korean_syllables(self) -> StandardKoreanSyllables<I>;
fn standard_korean_syllables(self) -> StandardizeKoreanSyllables<I>;

/// An iterator over the string in the variant of Unicode Normalization Form KD
/// defined by Korean Standard X 1026-1. This normalization differs from that defined by Unicode
Expand Down Expand Up @@ -183,6 +186,12 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]

fn nfkc_ks_x_1026_1(self) -> RecomposeHangul<Recompositions<NormalizeJamoKdkc<I>>>;

/// An iterator over the string with Hangul choseong and jungseong filler characters inserted
/// to ensure that all Korean syllable blocks are in standard form according to KS X 1026-1 § 7.8.
///
/// The fillers are U+115F (choseong filler) and U+1160 (jungseong filler).
/// Only available when the `ks_x_1026-1` crate feature is enabled.
#[cfg(feature = "ks_x_1026-1")]
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
fn standard_korean_syllables_ks_x_1026_1(self) -> StandardizeKoreanSyllablesKsX1026_1<I>;
}

impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
Expand Down Expand Up @@ -217,8 +226,8 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
}

#[inline]
fn standard_korean_syllables(self) -> StandardKoreanSyllables<Chars<'a>> {
StandardKoreanSyllables::new(self.chars())
fn standard_korean_syllables(self) -> StandardizeKoreanSyllables<Chars<'a>> {
StandardizeKoreanSyllables::new(self.chars())
}

#[cfg(feature = "ks_x_1026-1")]
Expand All @@ -243,6 +252,14 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
self.chars(),
)))
}

// Feeds the string's `chars()` into the KS X 1026-1 syllable-standardizing adapter.
// Marked `#[inline]` like the sibling `standard_korean_syllables`: this is a
// non-generic one-line wrapper, so without the hint it cannot be inlined across crates.
#[cfg(feature = "ks_x_1026-1")]
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
#[inline]
fn standard_korean_syllables_ks_x_1026_1(
    self,
) -> StandardizeKoreanSyllablesKsX1026_1<Chars<'a>> {
    StandardizeKoreanSyllablesKsX1026_1::new(self.chars())
}
}

impl UnicodeNormalization<option::IntoIter<char>> for char {
Expand Down Expand Up @@ -277,8 +294,8 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {
}

#[inline]
fn standard_korean_syllables(self) -> StandardKoreanSyllables<option::IntoIter<char>> {
StandardKoreanSyllables::new(Some(self).into_iter())
fn standard_korean_syllables(self) -> StandardizeKoreanSyllables<option::IntoIter<char>> {
StandardizeKoreanSyllables::new(Some(self).into_iter())
}

#[cfg(feature = "ks_x_1026-1")]
Expand All @@ -305,6 +322,14 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {
Some(self).into_iter(),
)))
}

// Feeds the single character, as a one-item iterator, into the KS X 1026-1
// syllable-standardizing adapter.
// Marked `#[inline]` like the sibling `standard_korean_syllables`: this is a
// non-generic one-line wrapper, so without the hint it cannot be inlined across crates.
#[cfg(feature = "ks_x_1026-1")]
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
#[inline]
fn standard_korean_syllables_ks_x_1026_1(
    self,
) -> StandardizeKoreanSyllablesKsX1026_1<option::IntoIter<char>> {
    StandardizeKoreanSyllablesKsX1026_1::new(Some(self).into_iter())
}
}

impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
Expand Down Expand Up @@ -339,8 +364,8 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
}

#[inline]
fn standard_korean_syllables(self) -> StandardKoreanSyllables<I> {
StandardKoreanSyllables::new(self)
fn standard_korean_syllables(self) -> StandardizeKoreanSyllables<I> {
StandardizeKoreanSyllables::new(self)
}

#[cfg(feature = "ks_x_1026-1")]
Expand All @@ -363,4 +388,10 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
fn nfkc_ks_x_1026_1(self) -> RecomposeHangul<Recompositions<NormalizeJamoKdkc<I>>> {
RecomposeHangul::new(recompose::new_compatible(NormalizeJamoKdkc::new(self)))
}

// Wraps the iterator itself in the KS X 1026-1 syllable-standardizing adapter.
// Carries `#[inline]` for consistency with the sibling `standard_korean_syllables`
// in this impl, which is annotated the same way.
#[cfg(feature = "ks_x_1026-1")]
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
#[inline]
fn standard_korean_syllables_ks_x_1026_1(self) -> StandardizeKoreanSyllablesKsX1026_1<I> {
    StandardizeKoreanSyllablesKsX1026_1::new(self)
}
}
215 changes: 172 additions & 43 deletions src/standardize_korean_syllables.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use core::iter::FusedIterator;
use core::{iter::FusedIterator, marker::PhantomData};

use tinyvec::ArrayVec;

Expand Down Expand Up @@ -42,17 +42,27 @@ impl JamoKind {
}
}

/// Iterator over a string's characters, with '\u{115F}' and '\u{1160}' inserted
/// where needed to ensure all Korean syllable blocks are in standard form
/// by [UAX29 rules](https://www.unicode.org/reports/tr29/#Standard_Korean_Syllables).
/// Rule set deciding which Hangul filler characters to insert between two
/// adjacent characters of the input.
///
/// Implemented by unit marker types, so the shared iterator can select a rule
/// set through a type parameter rather than a runtime value.
trait NormalizeKoreanSyllables {
/// Chooses the item to yield now, given the boundary between the previously
/// seen character and `next_c`.
///
/// * `next_c`: the item just pulled from the underlying iterator
///   (presumably `None` once the input is exhausted; the caller is not
///   fully visible here, so confirm against it).
/// * `prev_end_jamo_kind`: jamo kind that the previous character ends with,
///   or `None` if it has none.
/// * `next_start_jamo_kind`: jamo kind that `next_c` starts with, or `None`
///   if it has none.
/// * `buf`: scratch buffer, capacity three, for items to be yielded after the
///   returned one. The visible implementations push `next_c` first and any
///   further fillers afterwards, i.e. in reverse of the order in which their
///   comments say the items are emitted.
///
/// Returns either a filler character (with `next_c` deferred into `buf`) or
/// `next_c` itself when no filler is needed.
fn insert_fillers(
next_c: Option<char>,
prev_end_jamo_kind: Option<JamoKind>,
next_start_jamo_kind: Option<JamoKind>,
buf: &mut ArrayVec<[Option<char>; 3]>,
) -> Option<char>;
}

// Used to abstract over UAX29 and KS X 1026-1 rules
#[derive(Clone, Debug)]
pub struct StandardKoreanSyllables<I> {
struct StandardizeKoreanSyllablesInner<I, N> {
prev_end_jamo_kind: Option<JamoKind>,
buf: ArrayVec<[Option<char>; 3]>,
inner: I,
normalizer: PhantomData<N>,
}

impl<I: Iterator<Item = char>> Iterator for StandardKoreanSyllables<I> {
impl<I: Iterator<Item = char>, N: NormalizeKoreanSyllables> Iterator
for StandardizeKoreanSyllablesInner<I, N>
{
type Item = char;

fn next(&mut self) -> Option<Self::Item> {
Expand All @@ -65,7 +75,7 @@ impl<I: Iterator<Item = char>> Iterator for StandardKoreanSyllables<I> {
next_c.map_or((None, None), JamoKind::of);
self.prev_end_jamo_kind = next_end_jamo_kind;

insert_fillers(
N::insert_fillers(
next_c,
prev_end_jamo_kind,
next_start_jamo_kind,
Expand All @@ -87,50 +97,169 @@ impl<I: Iterator<Item = char>> Iterator for StandardKoreanSyllables<I> {
}
}

impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StandardKoreanSyllables<I> {}
impl<I: Iterator<Item = char> + FusedIterator, N: NormalizeKoreanSyllables> FusedIterator
for StandardizeKoreanSyllablesInner<I, N>
{
}

#[inline]
fn insert_fillers(
next_c: Option<char>,
prev_end_jamo_kind: Option<JamoKind>,
next_start_jamo_kind: Option<JamoKind>,
buf: &mut ArrayVec<[Option<char>; 3]>,
) -> Option<char> {
match (prev_end_jamo_kind, next_start_jamo_kind) {
// Insert choseong filler before V not preceded by L or V
(None, Some(JamoKind::V)) | (Some(JamoKind::T), Some(JamoKind::V)) => {
buf.push(next_c);
Some('\u{115F}')
}
// Insert choseong and jungseong fillers before T preceded non-jamo
(None, Some(JamoKind::T)) => {
buf.push(next_c);
buf.push(Some('\u{1160}'));
Some('\u{115F}')
}
// Insert V filler between L and non-jamo
(Some(JamoKind::L), None) => {
buf.push(next_c);
Some('\u{1160}')
impl<I, N> StandardizeKoreanSyllablesInner<I, N> {
#[inline]
fn new(iter: I) -> Self {
Self {
prev_end_jamo_kind: None,
buf: ArrayVec::new(),
inner: iter,
normalizer: PhantomData,
}
// For L followed by T, insert V filler, L filler, then another V filler
(Some(JamoKind::L), Some(JamoKind::T)) => {
buf.push(next_c);
buf.push(Some('\u{1160}'));
buf.push(Some('\u{115F}'));
Some('\u{1160}')
}
}

// UAX 29 normalization

/// Unit marker type selecting the UAX #29 filler-insertion rules; it carries no data
/// and is only used as the rule-set type parameter of `StandardizeKoreanSyllablesInner`.
#[derive(Clone, Debug)]
struct Uax29;

impl NormalizeKoreanSyllables for Uax29 {
#[inline]
fn insert_fillers(
next_c: Option<char>,
prev_end_jamo_kind: Option<JamoKind>,
next_start_jamo_kind: Option<JamoKind>,
buf: &mut ArrayVec<[Option<char>; 3]>,
) -> Option<char> {
match (prev_end_jamo_kind, next_start_jamo_kind) {
// Insert choseong filler before V not preceded by L or V
(None, Some(JamoKind::V)) | (Some(JamoKind::T), Some(JamoKind::V)) => {
buf.push(next_c);
Some('\u{115F}')
}
// Insert choseong and jungseong fillers before T preceded by non-jamo
(None, Some(JamoKind::T)) => {
buf.push(next_c);
buf.push(Some('\u{1160}'));
Some('\u{115F}')
}
// Insert V filler between L and non-jamo
(Some(JamoKind::L), None) => {
buf.push(next_c);
Some('\u{1160}')
}
// For L followed by T, insert V filler, L filler, then another V filler
(Some(JamoKind::L), Some(JamoKind::T)) => {
buf.push(next_c);
buf.push(Some('\u{1160}'));
buf.push(Some('\u{115F}'));
Some('\u{1160}')
}
_ => next_c,
}
_ => next_c,
}
}

impl<I> StandardKoreanSyllables<I> {
/// Iterator over a string's characters, with U+115F and U+1160 inserted
/// where needed to ensure all Korean syllable blocks are in standard form
/// by [UAX29 rules](https://www.unicode.org/reports/tr29/#Standard_Korean_Syllables).
///
/// U+115F is the Hangul choseong filler and U+1160 the Hangul jungseong filler.
/// Values of this type are returned by the `standard_korean_syllables` method of
/// `UnicodeNormalization`.
#[derive(Clone, Debug)]
pub struct StandardizeKoreanSyllables<I>(StandardizeKoreanSyllablesInner<I, Uax29>);

impl<I> StandardizeKoreanSyllables<I> {
#[inline]
pub(crate) fn new(iter: I) -> Self {
Self {
prev_end_jamo_kind: None,
buf: ArrayVec::new(),
inner: iter,
Self(StandardizeKoreanSyllablesInner::new(iter))
}
}

// The public UAX #29 adapter is a thin newtype: both iterator methods simply
// forward to the shared inner iterator it wraps.
impl<I> Iterator for StandardizeKoreanSyllables<I>
where
    I: Iterator<Item = char>,
{
    type Item = char;

    fn next(&mut self) -> Option<char> {
        Iterator::next(&mut self.0)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        Iterator::size_hint(&self.0)
    }
}

// Declares the UAX #29 adapter fused whenever the iterator it wraps is fused.
impl<I> FusedIterator for StandardizeKoreanSyllables<I>
where
    I: Iterator<Item = char> + FusedIterator,
{
}

// KS X 1026-1 normalization

/// Unit marker type selecting the KS X 1026-1 filler-insertion rules; it carries no data
/// and is only used as the rule-set type parameter of `StandardizeKoreanSyllablesInner`.
/// Compiled only when the `ks_x_1026-1` feature is enabled.
#[cfg(feature = "ks_x_1026-1")]
#[derive(Clone, Debug)]
struct KsX1026_1;

#[cfg(feature = "ks_x_1026-1")]
impl NormalizeKoreanSyllables for KsX1026_1 {
    // Picks the fillers required between the previous character and `next_c`.
    // The first filler (if any) is returned for immediate output; `next_c` and
    // the remaining fillers are stashed in `buf`, pushed in reverse of output order.
    #[inline]
    fn insert_fillers(
        next_c: Option<char>,
        prev_end_jamo_kind: Option<JamoKind>,
        next_start_jamo_kind: Option<JamoKind>,
        buf: &mut ArrayVec<[Option<char>; 3]>,
    ) -> Option<char> {
        const CHOSEONG_FILLER: char = '\u{115F}';
        const JUNGSEONG_FILLER: char = '\u{1160}';

        // Fillers that must precede `next_c`, listed in the order they are output.
        let fillers: &[char] = match (prev_end_jamo_kind, next_start_jamo_kind) {
            // L directly followed by T: V filler, L filler, then another V filler
            (Some(JamoKind::L), Some(JamoKind::T)) => {
                &[JUNGSEONG_FILLER, CHOSEONG_FILLER, JUNGSEONG_FILLER]
            }
            // T preceded by T or non-jamo: choseong filler, then jungseong filler
            (None, Some(JamoKind::T)) | (Some(JamoKind::T), Some(JamoKind::T)) => {
                &[CHOSEONG_FILLER, JUNGSEONG_FILLER]
            }
            // L followed by non-jamo or another L: V filler
            (Some(JamoKind::L), None) | (Some(JamoKind::L), Some(JamoKind::L)) => {
                &[JUNGSEONG_FILLER]
            }
            // V preceded by V, T or non-jamo: choseong filler
            (None, Some(JamoKind::V))
            | (Some(JamoKind::V), Some(JamoKind::V))
            | (Some(JamoKind::T), Some(JamoKind::V)) => &[CHOSEONG_FILLER],
            // Every other boundary is already in standard form
            _ => &[],
        };

        match fillers.split_first() {
            // Nothing to insert: pass the character through untouched.
            None => next_c,
            Some((&first, deferred)) => {
                // `next_c` goes in first so that it comes out after every filler.
                buf.push(next_c);
                for &filler in deferred.iter().rev() {
                    buf.push(Some(filler));
                }
                Some(first)
            }
        }
    }
}

/// Iterator over a string's characters, with U+115F and U+1160 inserted
/// where needed to ensure all Korean syllable blocks are in standard form
/// by [KS X 1026-1](http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3422.pdf) rules.
///
/// U+115F is the Hangul choseong filler and U+1160 the Hangul jungseong filler.
/// Values of this type are returned by the `standard_korean_syllables_ks_x_1026_1`
/// method of `UnicodeNormalization`.
#[cfg(feature = "ks_x_1026-1")]
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
#[derive(Clone, Debug)]
pub struct StandardizeKoreanSyllablesKsX1026_1<I>(StandardizeKoreanSyllablesInner<I, KsX1026_1>);

#[cfg(feature = "ks_x_1026-1")]
impl<I> StandardizeKoreanSyllablesKsX1026_1<I> {
    /// Wraps `iter` in a freshly initialized inner iterator that applies the
    /// KS X 1026-1 rule set.
    #[inline]
    pub(crate) fn new(iter: I) -> Self {
        let state = StandardizeKoreanSyllablesInner::<I, KsX1026_1>::new(iter);
        StandardizeKoreanSyllablesKsX1026_1(state)
    }
}

// The public KS X 1026-1 adapter is a thin newtype: both iterator methods
// simply forward to the shared inner iterator it wraps.
#[cfg(feature = "ks_x_1026-1")]
impl<I> Iterator for StandardizeKoreanSyllablesKsX1026_1<I>
where
    I: Iterator<Item = char>,
{
    type Item = char;

    fn next(&mut self) -> Option<char> {
        Iterator::next(&mut self.0)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        Iterator::size_hint(&self.0)
    }
}

// Declares the KS X 1026-1 adapter fused whenever the iterator it wraps is fused.
#[cfg(feature = "ks_x_1026-1")]
impl<I> FusedIterator for StandardizeKoreanSyllablesKsX1026_1<I>
where
    I: Iterator<Item = char> + FusedIterator,
{
}
Loading

0 comments on commit 887d390

Please sign in to comment.