Skip to content

Commit

Permalink
Add safety comments
Browse files Browse the repository at this point in the history
  • Loading branch information
Manishearth committed May 7, 2024
1 parent ac8fa20 commit 2c1cdec
Showing 1 changed file with 25 additions and 2 deletions.
27 changes: 25 additions & 2 deletions src/normalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,12 @@ where
}

// Perform decomposition for Hangul
#[allow(unsafe_code)]
if is_hangul_syllable(c) {
decompose_hangul(c, emit_char);
// Safety: Hangul Syllables invariant checked by is_hangul_syllable above
unsafe {
decompose_hangul(c, emit_char);
}
return;
}

Expand Down Expand Up @@ -127,27 +131,37 @@ const T_LAST: u32 = T_BASE + T_COUNT - 1;
// i.e. `T_BASE + 1 ..= T_LAST`.
const T_FIRST: u32 = T_BASE + 1;

// Safety-usable invariant: returns `true` only when `c` is a precomposed Hangul Syllable, i.e. its
// scalar value lies in `S_BASE..S_BASE + S_COUNT` (U+AC00..=U+D7A3, inside the Hangul Syllables
// block U+AC00..U+D7AF).
pub(crate) fn is_hangul_syllable(c: char) -> bool {
    // Safety: unsafe code relies on this being exactly the half-open range
    // 0xAC00 (S_BASE) .. 0xD7A4 (S_BASE + S_COUNT); widening it would break the invariant above.
    let code_point = u32::from(c);
    (S_BASE..S_BASE + S_COUNT).contains(&code_point)
}

// Decompose a precomposed Hangul syllable
// Safety: `s` MUST be a valid Hangul Syllable character, between U+AC00..U+D7AF
#[allow(unsafe_code)]
#[inline(always)]
fn decompose_hangul<F>(s: char, mut emit_char: F)
unsafe fn decompose_hangul<F>(s: char, mut emit_char: F)
where
F: FnMut(char),
{
// This will be at most 0x2baf, the size of the Hangul Syllables block
let s_index = s as u32 - S_BASE;
// This will be at most 0x2baf / (21 * 28), 19
let l_index = s_index / N_COUNT;
unsafe {
// Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
emit_char(char::from_u32_unchecked(L_BASE + l_index));

// Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21
let v_index = (s_index % N_COUNT) / T_COUNT;
// Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
emit_char(char::from_u32_unchecked(V_BASE + v_index));

// Safety: This will be at most T_COUNT - 1 (27)
let t_index = s_index % T_COUNT;
if t_index > 0 {
// Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
emit_char(char::from_u32_unchecked(T_BASE + t_index));
}
}
Expand All @@ -173,14 +187,23 @@ fn compose_hangul(a: char, b: char) -> Option<char> {
match (a, b) {
// Compose a leading consonant and a vowel together into an LV_Syllable
(L_BASE..=L_LAST, V_BASE..=V_LAST) => {
// Safety: based on the above bounds, l_index will be less than or equal to L_COUNT (19)
// and v_index will be <= V_COUNT (21)
let l_index = a - L_BASE;
let v_index = b - V_BASE;
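// Safety: the match arm bounds give l_index <= L_COUNT - 1 (18) and v_index <= V_COUNT - 1 (20), assuming
// L_LAST and V_LAST are defined like T_LAST (BASE + COUNT - 1). With N_COUNT = 21 * 28 = 588 and T_COUNT = 28,
// this will be <= 18 * 588 + 20 * 28, which is 11144 (0x2B88).
let lv_index = l_index * N_COUNT + v_index * T_COUNT;
// Safety: This is between 0xAC00 and 0xD788, which are in range for Hangul Syllables (U+AC00..U+D7AF) and also in range
// for BMP unicode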
let s = S_BASE + lv_index;
// Safety: We've verified this is in-range
Some(unsafe { char::from_u32_unchecked(s) })
}
// Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
(S_BASE..=S_LAST, T_FIRST..=T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
// Safety: a is between 0xAC00 and (0xAC00 + 19 * 21 * 28). b is in T_FIRST..=T_LAST, so b - T_BASE is between 1 and 27
// (T_COUNT - 1). Adding a number 1 to 27 to a number that is at largest 0xD7A4 will not go out of bounds to 0xD800
// (where the surrogates start), so this is safe.
Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
}
_ => None,
Expand Down

0 comments on commit 2c1cdec

Please sign in to comment.