From bdc1428ea745097ea2d12fbc948de80420c77d9d Mon Sep 17 00:00:00 2001 From: Colin Rofls Date: Mon, 17 Jun 2024 13:33:13 -0400 Subject: [PATCH] [text-format] Fix parsing of string literals This renames `next_byte_value` to `next_str_lit_bytes`, which may now return between 1 and 4 bytes per call, reflecting the variable-length nature of the UTF-8 encoding. --- protobuf-support/src/lexer/lexer_impl.rs | 62 +++++++++++++------ protobuf-support/src/lexer/str_lit.rs | 14 +++-- .../src/common/v2/test_fmt_text_format.rs | 3 +- 3 files changed, 53 insertions(+), 26 deletions(-) diff --git a/protobuf-support/src/lexer/lexer_impl.rs b/protobuf-support/src/lexer/lexer_impl.rs index f0d6a9609..6c18f3a90 100644 --- a/protobuf-support/src/lexer/lexer_impl.rs +++ b/protobuf-support/src/lexer/lexer_impl.rs @@ -67,6 +67,34 @@ impl From for LexerError { } } +/// The raw bytes for a single char or escape sequence in a string literal +/// +/// The raw bytes are available via the `bytes` method. +pub(crate) struct DecodedBytes { + // a single char can be up to 4 bytes when encoded in UTF-8 + buf: [u8; 4], + len: usize, +} + +impl DecodedBytes { + fn byte(b: u8) -> DecodedBytes { + DecodedBytes { + buf: [b, 0, 0, 0], + len: 1, + } + } + + fn char(value: char) -> Self { + let mut buf = [0; 4]; + let len = value.encode_utf8(&mut buf).len(); + DecodedBytes { buf, len } + } + + pub(crate) fn bytes(&self) -> &[u8] { + &self.buf[..self.len] + } +} + #[derive(Copy, Clone)] pub struct Lexer<'a> { language: ParserLanguage, @@ -440,24 +468,24 @@ impl<'a> Lexer<'a> { // octEscape = '\' octalDigit octalDigit octalDigit // charEscape = '\' ( "a" | "b" | "f" | "n" | "r" | "t" | "v" | '\' | "'" | '"' ) // quote = "'" | '"' - pub fn next_byte_value(&mut self) -> LexerResult { + pub(crate) fn next_str_lit_bytes(&mut self) -> LexerResult { match self.next_char()? { '\\' => { match self.next_char()? 
{ - '\'' => Ok(b'\''), - '"' => Ok(b'"'), - '\\' => Ok(b'\\'), - 'a' => Ok(b'\x07'), - 'b' => Ok(b'\x08'), - 'f' => Ok(b'\x0c'), - 'n' => Ok(b'\n'), - 'r' => Ok(b'\r'), - 't' => Ok(b'\t'), - 'v' => Ok(b'\x0b'), + '\'' => Ok(DecodedBytes::byte(b'\'')), + '"' => Ok(DecodedBytes::byte(b'"')), + '\\' => Ok(DecodedBytes::byte(b'\\')), + 'a' => Ok(DecodedBytes::byte(b'\x07')), + 'b' => Ok(DecodedBytes::byte(b'\x08')), + 'f' => Ok(DecodedBytes::byte(b'\x0c')), + 'n' => Ok(DecodedBytes::byte(b'\n')), + 'r' => Ok(DecodedBytes::byte(b'\r')), + 't' => Ok(DecodedBytes::byte(b'\t')), + 'v' => Ok(DecodedBytes::byte(b'\x0b')), 'x' => { let d1 = self.next_hex_digit()? as u8; let d2 = self.next_hex_digit()? as u8; - Ok(((d1 << 4) | d2) as u8) + Ok(DecodedBytes::byte((d1 << 4) | d2)) } d if d >= '0' && d <= '7' => { let mut r = d as u8 - b'0'; @@ -467,16 +495,14 @@ impl<'a> Lexer<'a> { Ok(d) => r = (r << 3) + d as u8, } } - Ok(r) + Ok(DecodedBytes::byte(r)) } // https://github.com/google/protobuf/issues/4562 - // TODO: overflow - c => Ok(c as u8), + c => Ok(DecodedBytes::char(c)), } } '\n' | '\0' => Err(LexerError::IncorrectInput), - // TODO: check overflow - c => Ok(c as u8), + c => Ok(DecodedBytes::char(c)), } } @@ -530,7 +556,7 @@ impl<'a> Lexer<'a> { }; first = false; while self.lookahead_char() != Some(q) { - self.next_byte_value()?; + self.next_str_lit_bytes()?; } self.next_char_expect_eq(q)?; diff --git a/protobuf-support/src/lexer/str_lit.rs b/protobuf-support/src/lexer/str_lit.rs index 0e51a16bf..66483bb6b 100644 --- a/protobuf-support/src/lexer/str_lit.rs +++ b/protobuf-support/src/lexer/str_lit.rs @@ -32,10 +32,11 @@ impl StrLit { let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json); let mut r = Vec::new(); while !lexer.eof() { - r.push( + r.extend( lexer - .next_byte_value() - .map_err(|_| StrLitDecodeError::OtherError)?, + .next_str_lit_bytes() + .map_err(|_| StrLitDecodeError::OtherError)? + .bytes(), ); } Ok(String::from_utf8(r)?) 
@@ -45,10 +46,11 @@ impl StrLit { let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json); let mut r = Vec::new(); while !lexer.eof() { - r.push( + r.extend( lexer - .next_byte_value() - .map_err(|_| StrLitDecodeError::OtherError)?, + .next_str_lit_bytes() + .map_err(|_| StrLitDecodeError::OtherError)? + .bytes(), ); } Ok(r) diff --git a/test-crates/protobuf-codegen-protoc-test/src/common/v2/test_fmt_text_format.rs b/test-crates/protobuf-codegen-protoc-test/src/common/v2/test_fmt_text_format.rs index df034ef1a..fb714abe1 100644 --- a/test-crates/protobuf-codegen-protoc-test/src/common/v2/test_fmt_text_format.rs +++ b/test-crates/protobuf-codegen-protoc-test/src/common/v2/test_fmt_text_format.rs @@ -117,8 +117,7 @@ fn test_string_bytes() { #[test] fn non_ascii_strings() { test_text_format_str_descriptor("string_singular: \"À\"", &TestTypes::descriptor()); - // TODO: fix this. - // test_text_format_str_descriptor("string_singular: \"日月\"", &TestTypes::descriptor()); + test_text_format_str_descriptor("string_singular: \"日月\"", &TestTypes::descriptor()); } #[test]