From 4f763467774a3e40f7b43166a48fa062ba54bc9c Mon Sep 17 00:00:00 2001 From: Ethiraric Date: Sat, 23 Mar 2024 16:34:35 +0100 Subject: [PATCH] Switch from `encoding` to `encoding_rs`. See https://github.com/rustsec/advisory-db/issues/1605. --- CHANGELOG.md | 22 ++++++++ Cargo.toml | 2 +- src/yaml.rs | 150 ++++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 155 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c348e51..20c4678 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,27 @@ # Changelog +## Upcoming +### Breaking changes + - The `encoding` library has been replaced with `encoding_rs`. If you use the + `trap` of `YamlDecoder`, this change will make your code not compile. + An additional enum `YAMLDecodingTrap` has been added to abstract the + underlying library and avoid breaking changes in the future. This + additionally lifts the `encoding` dependency on _your_ project if you were + using that feature. + - The `encoding::types::DecoderTrap` has been replaced with `YAMLDecodingTrap`. + - The signature of the function for `YAMLDecodingTrap::Call` has changed: + ```rs + // Before, with `encoding::types::DecoderTrap::Call` + fn(_: &mut encoding::RawDecoder, _: &[u8], _: &mut encoding::StringWriter) -> bool; + // Now, with `YAMLDecodingTrap::Call` + fn(_: u8, _: u8, _: &[u8], _: &mut String) -> ControlFlow<Cow<'static, str>>; + ``` + Please refer to the `YAMLDecodingTrapFn` documentation for more details. 
+ +**Features**: + +**Development**: + ## v0.7.0 **Features**: diff --git a/Cargo.toml b/Cargo.toml index bf96f58..c0bc62e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ edition = "2021" [dependencies] arraydeque = "0.5.1" -encoding = "0.2" +encoding_rs = "0.8.33" hashlink = "0.8" [dev-dependencies] diff --git a/src/yaml.rs b/src/yaml.rs index e310795..2ef4376 100644 --- a/src/yaml.rs +++ b/src/yaml.rs @@ -2,8 +2,11 @@ #![allow(clippy::module_name_repetitions)] +use std::borrow::Cow; +use std::ops::ControlFlow; use std::{collections::BTreeMap, convert::TryFrom, mem, ops::Index}; +use encoding_rs::{Decoder, DecoderResult, Encoding}; use hashlink::LinkedHashMap; use crate::parser::{Event, MarkedEventReceiver, Parser, Tag}; @@ -238,11 +241,51 @@ impl YamlLoader { } } +/// The signature of the function to call when using [`YAMLDecodingTrap::Call`]. +/// +/// The arguments are as follows: +/// * `malformation_length`: The length of the sequence the decoder failed to decode. +/// * `bytes_read_after_malformation`: The number of lookahead bytes the decoder consumed after +/// the malformation. +/// * `input_at_malformation`: What the input buffer is at the malformation. +/// This is the buffer starting at the malformation. The first `malformation_length` bytes are +/// the problematic sequence. The following `bytes_read_after_malformation` are already stored +/// in the decoder and will not be re-fed. +/// * `output`: The output string. +/// +/// The function must modify `output` as it feels is best. For instance, one could recreate the +/// behavior of [`YAMLDecodingTrap::Ignore`] with an empty function, [`YAMLDecodingTrap::Replace`] +/// by pushing a `\u{FFFD}` into `output` and [`YAMLDecodingTrap::Strict`] by returning +/// [`ControlFlow::Break`]. +/// +/// # Returns +/// The function must return [`ControlFlow::Continue`] if decoding may continue or +/// [`ControlFlow::Break`] if decoding must be aborted. An optional error string may be supplied. 
+pub type YAMLDecodingTrapFn = fn( + malformation_length: u8, + bytes_read_after_malformation: u8, + input_at_malformation: &[u8], + output: &mut String, +) -> ControlFlow<Cow<'static, str>>; + +/// The behavior [`YamlDecoder`] must have when a decoding error occurs. +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum YAMLDecodingTrap { + /// Ignore the offending bytes, remove them from the output. + Ignore, + /// Error out. + Strict, + /// Replace them with the Unicode REPLACEMENT CHARACTER. + Replace, + /// Call the user-supplied function upon decoding malformation. + Call(YAMLDecodingTrapFn), +} + /// `YamlDecoder` is a `YamlLoader` builder that allows you to supply your own encoding error trap. /// For example, to read a YAML file while ignoring Unicode decoding errors you can set the /// `encoding_trap` to `encoding::DecoderTrap::Ignore`. /// ```rust -/// use yaml_rust2::yaml::YamlDecoder; +/// use yaml_rust2::yaml::{YamlDecoder, YAMLDecodingTrap}; /// /// let string = b"--- /// a\xa9: 1 @@ -250,13 +293,13 @@ impl YamlLoader { /// c: [1, 2] /// "; /// let out = YamlDecoder::read(string as &[u8]) -/// .encoding_trap(encoding::DecoderTrap::Ignore) +/// .encoding_trap(YAMLDecodingTrap::Ignore) /// .decode() /// .unwrap(); /// ``` pub struct YamlDecoder<T: std::io::Read> { source: T, - trap: encoding::types::DecoderTrap, + trap: YAMLDecodingTrap, } impl<T: std::io::Read> YamlDecoder<T> { @@ -264,12 +307,12 @@ impl<T: std::io::Read> YamlDecoder<T> { pub fn read(source: T) -> YamlDecoder<T> { YamlDecoder { source, - trap: encoding::DecoderTrap::Strict, + trap: YAMLDecodingTrap::Strict, } } /// Set the behavior of the decoder when the encoding is invalid. - pub fn encoding_trap(&mut self, trap: encoding::types::DecoderTrap) -> &mut Self { + pub fn encoding_trap(&mut self, trap: YAMLDecodingTrap) -> &mut Self { self.trap = trap; self } @@ -282,13 +325,84 @@ impl<T: std::io::Read> YamlDecoder<T> { let mut buffer = Vec::new(); self.source.read_to_end(&mut buffer)?; - // Decodes the input buffer using either UTF-8, UTF-16LE or UTF-16BE depending on the BOM codepoint. 
- // If the buffer doesn't start with a BOM codepoint, it will use a fallback encoding obtained by - // detect_utf16_endianness. - let (res, _) = - encoding::types::decode(&buffer, self.trap, detect_utf16_endianness(&buffer)); - let s = res.map_err(LoadError::Decode)?; - YamlLoader::load_from_str(&s).map_err(LoadError::Scan) + // Check if the `encoding_rs` library can detect encoding from the BOM, otherwise use + // `detect_utf16_endianness`. + let (encoding, _) = + Encoding::for_bom(&buffer).unwrap_or_else(|| (detect_utf16_endianness(&buffer), 2)); + let mut decoder = encoding.new_decoder(); + let mut output = String::new(); + + // Decode the input buffer. + decode_loop(&buffer, &mut output, &mut decoder, self.trap)?; + + YamlLoader::load_from_str(&output).map_err(LoadError::Scan) + } +} + +/// Loop over [`Decoder::decode_to_string_without_replacement`], growing `output` if needed. +fn decode_loop( + input: &[u8], + output: &mut String, + decoder: &mut Decoder, + trap: YAMLDecodingTrap, +) -> Result<(), LoadError> { + output.reserve(input.len()); + let mut total_bytes_read = 0; + + loop { + match decoder.decode_to_string_without_replacement(&input[total_bytes_read..], output, true) + { + // If the input is empty, we processed the whole input. + (DecoderResult::InputEmpty, _) => break Ok(()), + // If the output is full, we must reallocate. + (DecoderResult::OutputFull, bytes_read) => { + total_bytes_read += bytes_read; + // The output is already reserved to the size of the input. We slowly resize. Here, + // we're expecting that 10% of bytes will double in size when converting to UTF-8. + output.reserve(input.len() / 10); + } + (DecoderResult::Malformed(malformed_len, bytes_after_malformed), bytes_read) => { + total_bytes_read += bytes_read; + match trap { + // Ignore (skip over) malformed character. + YAMLDecodingTrap::Ignore => {} + // Replace them with the Unicode REPLACEMENT CHARACTER. 
+ YAMLDecodingTrap::Replace => { + output.push('\u{FFFD}'); + } + // Otherwise error, getting as much context as possible. + YAMLDecodingTrap::Strict => { + let malformed_len = malformed_len as usize; + let bytes_after_malformed = bytes_after_malformed as usize; + let byte_idx = total_bytes_read - (malformed_len + bytes_after_malformed); + let malformed_sequence = &input[byte_idx..byte_idx + malformed_len]; + + break Err(LoadError::Decode(Cow::Owned(format!( + "Invalid character sequence at {byte_idx}: {malformed_sequence:?}", + )))); + } + YAMLDecodingTrap::Call(callback) => { + let byte_idx = + total_bytes_read - ((malformed_len + bytes_after_malformed) as usize); + let malformed_sequence = + &input[byte_idx..byte_idx + malformed_len as usize]; + if let ControlFlow::Break(error) = callback( + malformed_len, + bytes_after_malformed, + &input[byte_idx..], + output, + ) { + if error.is_empty() { + break Err(LoadError::Decode(Cow::Owned(format!( + "Invalid character sequence at {byte_idx}: {malformed_sequence:?}", + )))); + } + break Err(LoadError::Decode(error)); + } + } + } + } + } } } @@ -301,15 +415,15 @@ impl YamlDecoder { /// This allows the encoding to be deduced by the pattern of null (#x00) characters. // /// See spec at -fn detect_utf16_endianness(b: &[u8]) -> encoding::types::EncodingRef { +fn detect_utf16_endianness(b: &[u8]) -> &'static Encoding { if b.len() > 1 && (b[0] != b[1]) { if b[0] == 0 { - return encoding::all::UTF_16BE; + return encoding_rs::UTF_16BE; } else if b[1] == 0 { - return encoding::all::UTF_16LE; + return encoding_rs::UTF_16LE; } } - encoding::all::UTF_8 + encoding_rs::UTF_8 } macro_rules! 
define_as ( @@ -550,7 +664,7 @@ impl Iterator for YamlIter { #[cfg(test)] mod test { - use super::{Yaml, YamlDecoder}; + use super::{YAMLDecodingTrap, Yaml, YamlDecoder}; #[test] fn test_read_bom() { @@ -623,7 +737,7 @@ b: 2.2 c: [1, 2] "; let out = YamlDecoder::read(s as &[u8]) - .encoding_trap(encoding::DecoderTrap::Ignore) + .encoding_trap(YAMLDecodingTrap::Ignore) .decode() .unwrap(); let doc = &out[0];