From 165d475d9cfddb3207b32906802abf36af37f3b5 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Tue, 22 Oct 2024 13:38:14 -0500 Subject: [PATCH 1/4] printf: accept non-UTF-8 input in FORMAT and ARGUMENT arguments Other implementations of `printf` permit arbitrary data to be passed to `printf`. The only restriction is that a null byte terminates FORMAT and ARGUMENT argument strings (since they are C strings). The current implementation only accepts FORMAT and ARGUMENT arguments that are valid UTF-8 (this is being enforced by clap). This commit removes the UTF-8 validation by switching to OsStr and OsString. This allows users to use `printf` to transmit or reformat null-safe but not UTF-8-safe data, such as text encoded in an 8-bit text encoding. See the `non_utf_8_input` test for an example (ISO-8859-1 text). --- src/uu/echo/Cargo.toml | 2 +- src/uu/echo/src/echo.rs | 31 +------ src/uu/printf/src/printf.rs | 29 ++++-- .../src/lib/features/format/argument.rs | 88 ++++++++++++++++--- src/uucore/src/lib/features/format/spec.rs | 32 ++++--- tests/by-util/test_printf.rs | 26 ++++++ 6 files changed, 146 insertions(+), 62 deletions(-) diff --git a/src/uu/echo/Cargo.toml b/src/uu/echo/Cargo.toml index faabc121df1..7b2713ad795 100644 --- a/src/uu/echo/Cargo.toml +++ b/src/uu/echo/Cargo.toml @@ -18,7 +18,7 @@ path = "src/echo.rs" [dependencies] clap = { workspace = true } -uucore = { workspace = true } +uucore = { workspace = true, features = ["format"] } [[bin]] name = "echo" diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index 746cdd7c59e..fdc6c245607 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -6,12 +6,12 @@ use clap::builder::ValueParser; use clap::parser::ValuesRef; use clap::{crate_version, Arg, ArgAction, Command}; -use std::ffi::{OsStr, OsString}; +use std::ffi::OsString; use std::io::{self, StdoutLock, Write}; use std::iter::Peekable; use std::ops::ControlFlow; use std::slice::Iter; -use uucore::error::{UResult, USimpleError}; +use uucore::error::UResult; use uucore::{format_usage, help_about, help_section, help_usage}; const ABOUT: &str = help_about!("echo.md"); @@ -355,12 +355,7 @@ fn execute( arguments_after_options: ValuesRef<'_, OsString>, ) -> UResult<()> { for (i, input) in arguments_after_options.enumerate() { - let Some(bytes) = bytes_from_os_string(input.as_os_str()) else { - return Err(USimpleError::new( - 1, - "Non-UTF-8 arguments provided, but this platform does not support them", - )); - }; + let bytes = uucore::format::bytes_from_os_str(input)?; if i > 0 { stdout_lock.write_all(b" ")?; @@ -381,23 +376,3 @@ fn execute( Ok(()) } - -fn bytes_from_os_string(input: &OsStr) -> Option<&[u8]> { - let option = { - #[cfg(target_family = "unix")] - { - use std::os::unix::ffi::OsStrExt; - - Some(input.as_bytes()) - } - - #[cfg(not(target_family = "unix"))] - { - // TODO - // Verify that this works correctly on these platforms - input.to_str().map(|st| st.as_bytes()) - } - }; - - option -} diff --git a/src/uu/printf/src/printf.rs b/src/uu/printf/src/printf.rs index f86b7bd9fa2..217d3ef6181 100644 --- a/src/uu/printf/src/printf.rs +++ b/src/uu/printf/src/printf.rs @@ -5,11 +5,13 @@ #![allow(dead_code)] +use clap::builder::ValueParser; use clap::{crate_version, Arg, ArgAction, Command}; +use std::ffi::OsString; use std::io::stdout; use std::ops::ControlFlow; use uucore::error::{UResult, UUsageError}; -use uucore::format::{parse_spec_and_escape, FormatArgument, FormatItem}; +use uucore::format::{bytes_from_os_str, parse_spec_and_escape, FormatArgument, FormatItem}; use uucore::{format_usage, help_about, help_section, help_usage}; const VERSION: &str = "version"; @@ -28,17 +30,22 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uu_app().get_matches_from(args); let format = matches - .get_one::(options::FORMAT) + .get_one::(options::FORMAT) .ok_or_else(|| UUsageError::new(1, "missing operand"))?; - let values: Vec<_> = match matches.get_many::(options::ARGUMENT) { - Some(s) => s.map(|s| FormatArgument::Unparsed(s.to_string())).collect(), - None => vec![], + let format_bytes = bytes_from_os_str(format)?; + + let values = match matches.get_many::(options::ARGUMENT) { + Some(os_string) => os_string + .map(|os_string_ref| FormatArgument::Unparsed(os_string_ref.to_owned())) + .collect(), + None => Vec::::new(), }; let mut format_seen = false; let mut args = values.iter().peekable(); - for item in parse_spec_and_escape(format.as_ref()) { + + for item in parse_spec_and_escape(format_bytes) { if let Ok(FormatItem::Spec(_)) = item { format_seen = true; } @@ -55,7 +62,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { } while args.peek().is_some() { - for item in parse_spec_and_escape(format.as_ref()) { + for item in parse_spec_and_escape(format_bytes) { match item?.write(stdout(), &mut args)? { ControlFlow::Continue(()) => {} ControlFlow::Break(()) => return Ok(()), @@ -86,6 +93,10 @@ pub fn uu_app() -> Command { .help("Print version information") .action(ArgAction::Version), ) - .arg(Arg::new(options::FORMAT)) - .arg(Arg::new(options::ARGUMENT).action(ArgAction::Append)) + .arg(Arg::new(options::FORMAT).value_parser(ValueParser::os_string())) + .arg( + Arg::new(options::ARGUMENT) + .action(ArgAction::Append) + .value_parser(ValueParser::os_string()), + ) } diff --git a/src/uucore/src/lib/features/format/argument.rs b/src/uucore/src/lib/features/format/argument.rs index 75851049895..5ec847a9549 100644 --- a/src/uucore/src/lib/features/format/argument.rs +++ b/src/uucore/src/lib/features/format/argument.rs @@ -4,13 +4,13 @@ // file that was distributed with this source code. use crate::{ - error::set_exit_code, + error::{set_exit_code, UResult, USimpleError}, features::format::num_parser::{ParseError, ParsedNumber}, quoting_style::{escape_name, Quotes, QuotingStyle}, - show_error, show_warning, + show, show_error, show_warning, }; use os_display::Quotable; -use std::ffi::OsStr; +use std::ffi::{OsStr, OsString}; /// An argument for formatting /// @@ -22,12 +22,12 @@ use std::ffi::OsStr; #[derive(Clone, Debug)] pub enum FormatArgument { Char(char), - String(String), + String(OsString), UnsignedInt(u64), SignedInt(i64), Float(f64), /// Special argument that gets coerced into the other variants - Unparsed(String), + Unparsed(OsString), } pub trait ArgumentIter<'a>: Iterator { @@ -35,7 +35,7 @@ pub trait ArgumentIter<'a>: Iterator { fn get_i64(&mut self) -> i64; fn get_u64(&mut self) -> u64; fn get_f64(&mut self) -> f64; - fn get_str(&mut self) -> &'a str; + fn get_str(&mut self) -> &'a OsStr; } impl<'a, T: Iterator> ArgumentIter<'a> for T { @@ -45,7 +45,10 @@ impl<'a, T: Iterator> ArgumentIter<'a> for T { }; match next { FormatArgument::Char(c) => *c as u8, - FormatArgument::Unparsed(s) => s.bytes().next().unwrap_or(b'\0'), + FormatArgument::Unparsed(os) => match bytes_from_os_str(os).unwrap().first() { + Some(&byte) => byte, + None => b'\0', + }, _ => b'\0', } } @@ -56,7 +59,11 @@ impl<'a, T: Iterator> ArgumentIter<'a> for T { }; match next { FormatArgument::UnsignedInt(n) => *n, - FormatArgument::Unparsed(s) => extract_value(ParsedNumber::parse_u64(s), s), + FormatArgument::Unparsed(os) => { + let str = get_str_or_exit_with_error(os); + + extract_value(ParsedNumber::parse_u64(str), str) + } _ => 0, } } @@ -67,7 +74,11 @@ impl<'a, T: Iterator> ArgumentIter<'a> for T { }; match next { FormatArgument::SignedInt(n) => *n, - FormatArgument::Unparsed(s) => extract_value(ParsedNumber::parse_i64(s), s), + FormatArgument::Unparsed(os) => { + let str = get_str_or_exit_with_error(os); + + extract_value(ParsedNumber::parse_i64(str), str) + } _ => 0, } } @@ -78,15 +89,19 @@ impl<'a, T: Iterator> ArgumentIter<'a> for T { }; match next { FormatArgument::Float(n) => *n, - FormatArgument::Unparsed(s) => extract_value(ParsedNumber::parse_f64(s), s), + FormatArgument::Unparsed(os) => { + let str = get_str_or_exit_with_error(os); + + extract_value(ParsedNumber::parse_f64(str), str) + } _ => 0.0, } } - fn get_str(&mut self) -> &'a str { + fn get_str(&mut self) -> &'a OsStr { match self.next() { - Some(FormatArgument::Unparsed(s) | FormatArgument::String(s)) => s, - _ => "", + Some(FormatArgument::Unparsed(os) | FormatArgument::String(os)) => os, + _ => "".as_ref(), } } } @@ -126,3 +141,50 @@ fn extract_value(p: Result>, input: &str) -> T } } } + +pub fn bytes_from_os_str(input: &OsStr) -> UResult<&[u8]> { + let result = { + #[cfg(target_family = "unix")] + { + use std::os::unix::ffi::OsStrExt; + + Ok(input.as_bytes()) + } + + #[cfg(not(target_family = "unix"))] + { + use crate::error::USimpleError; + + // TODO + // Verify that this works correctly on these platforms + match input.to_str().map(|st| st.as_bytes()) { + Some(sl) => Ok(sl), + None => Err(USimpleError::new( + 1, + "non-UTF-8 string encountered when not allowed", + )), + } + } + }; + + result +} + +fn get_str_or_exit_with_error(os_str: &OsStr) -> &str { + match os_str.to_str() { + Some(st) => st, + None => { + let cow = os_str.to_string_lossy(); + + let quoted = cow.quote(); + + let error = format!( + "argument like {quoted} is not a valid UTF-8 string, and could not be parsed as an integer", + ); + + show!(USimpleError::new(1, error.clone())); + + panic!("{error}"); + } + } +} diff --git a/src/uucore/src/lib/features/format/spec.rs b/src/uucore/src/lib/features/format/spec.rs index 581e1fa0624..e3b88fd68d4 100644 --- a/src/uucore/src/lib/features/format/spec.rs +++ b/src/uucore/src/lib/features/format/spec.rs @@ -5,15 +5,15 @@ // spell-checker:ignore (vars) intmax ptrdiff padlen -use crate::quoting_style::{escape_name, QuotingStyle}; - use super::{ + bytes_from_os_str, num_format::{ self, Case, FloatVariant, ForceDecimal, Formatter, NumberAlignment, PositiveSign, Prefix, UnsignedIntVariant, }, parse_escape_only, ArgumentIter, FormatChar, FormatError, }; +use crate::quoting_style::{escape_name, QuotingStyle}; use std::{io::Write, ops::ControlFlow}; /// A parsed specification for formatting a value @@ -331,17 +331,26 @@ impl Spec { // TODO: We need to not use Rust's formatting for aligning the output, // so that we can just write bytes to stdout without panicking. let precision = resolve_asterisk(*precision, &mut args)?; - let s = args.get_str(); + + let os_str = args.get_str(); + + let bytes = bytes_from_os_str(os_str).unwrap(); + let truncated = match precision { - Some(p) if p < s.len() => &s[..p], - _ => s, + Some(p) if p < os_str.len() => &bytes[..p], + _ => bytes, }; - write_padded(writer, truncated.as_bytes(), width, *align_left) + + write_padded(writer, truncated, width, *align_left) } Self::EscapedString => { - let s = args.get_str(); - let mut parsed = Vec::new(); - for c in parse_escape_only(s.as_bytes()) { + let os_str = args.get_str(); + + let bytes = bytes_from_os_str(os_str).unwrap(); + + let mut parsed = Vec::::new(); + + for c in parse_escape_only(bytes) { match c.write(&mut parsed)? { ControlFlow::Continue(()) => {} ControlFlow::Break(()) => { @@ -353,11 +362,12 @@ impl Spec { writer.write_all(&parsed).map_err(FormatError::IoError) } Self::QuotedString => { - let s = args.get_str(); + let os = args.get_str(); + writer .write_all( escape_name( - s.as_ref(), + os, &QuotingStyle::Shell { escape: true, always_quote: false, diff --git a/tests/by-util/test_printf.rs b/tests/by-util/test_printf.rs index 29a8cc9140f..41a247dc9d2 100644 --- a/tests/by-util/test_printf.rs +++ b/tests/by-util/test_printf.rs @@ -916,3 +916,29 @@ fn float_flag_position_space_padding() { .succeeds() .stdout_only(" +1.0"); } + +#[test] +#[cfg(target_family = "unix")] +fn non_utf_8_input() { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + // ISO-8859-1 encoded text + // spell-checker:disable + const INPUT_AND_OUTPUT: &[u8] = + b"Swer an rehte g\xFCete wendet s\xEEn gem\xFCete, dem volget s\xE6lde und \xEAre."; + // spell-checker:enable + + let os_str = OsStr::from_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg("%s") + .arg(os_str) + .succeeds() + .stdout_only_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg(os_str) + .succeeds() + .stdout_only_bytes(INPUT_AND_OUTPUT); +} From f0bad16719fb7ab52934517dce41afd365ccf9e9 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Tue, 22 Oct 2024 15:39:04 -0500 Subject: [PATCH 2/4] Fix error handling. Add test. --- src/uu/echo/src/echo.rs | 2 +- src/uu/printf/src/printf.rs | 6 +- .../src/lib/features/format/argument.rs | 128 ++++++++++-------- src/uucore/src/lib/features/format/mod.rs | 15 ++ src/uucore/src/lib/features/format/spec.rs | 17 ++- tests/by-util/test_printf.rs | 7 + 6 files changed, 105 insertions(+), 70 deletions(-) diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index fdc6c245607..4ed003854f7 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -355,7 +355,7 @@ fn execute( arguments_after_options: ValuesRef<'_, OsString>, ) -> UResult<()> { for (i, input) in arguments_after_options.enumerate() { - let bytes = uucore::format::bytes_from_os_str(input)?; + let bytes = uucore::format::try_get_bytes_from_os_str(input)?; if i > 0 { stdout_lock.write_all(b" ")?; diff --git a/src/uu/printf/src/printf.rs b/src/uu/printf/src/printf.rs index 217d3ef6181..eac02ae6b86 100644 --- a/src/uu/printf/src/printf.rs +++ b/src/uu/printf/src/printf.rs @@ -11,7 +11,9 @@ use std::ffi::OsString; use std::io::stdout; use std::ops::ControlFlow; use uucore::error::{UResult, UUsageError}; -use uucore::format::{bytes_from_os_str, parse_spec_and_escape, FormatArgument, FormatItem}; +use uucore::format::{ + parse_spec_and_escape, try_get_bytes_from_os_str, FormatArgument, FormatItem, +}; use uucore::{format_usage, help_about, help_section, help_usage}; const VERSION: &str = "version"; @@ -33,7 +35,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { .get_one::(options::FORMAT) .ok_or_else(|| UUsageError::new(1, "missing operand"))?; - let format_bytes = bytes_from_os_str(format)?; + let format_bytes = try_get_bytes_from_os_str(format)?; let values = match matches.get_many::(options::ARGUMENT) { Some(os_string) => os_string diff --git a/src/uucore/src/lib/features/format/argument.rs b/src/uucore/src/lib/features/format/argument.rs index 5ec847a9549..090817acaa2 100644 --- a/src/uucore/src/lib/features/format/argument.rs +++ b/src/uucore/src/lib/features/format/argument.rs @@ -3,14 +3,19 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +use super::FormatError; use crate::{ - error::{set_exit_code, UResult, USimpleError}, + error::{set_exit_code, UError}, features::format::num_parser::{ParseError, ParsedNumber}, quoting_style::{escape_name, Quotes, QuotingStyle}, - show, show_error, show_warning, + show_error, show_warning, }; use os_display::Quotable; -use std::ffi::{OsStr, OsString}; +use std::{ + error::Error, + ffi::{OsStr, OsString}, + fmt::Display, +}; /// An argument for formatting /// @@ -31,70 +36,70 @@ pub enum FormatArgument { } pub trait ArgumentIter<'a>: Iterator { - fn get_char(&mut self) -> u8; - fn get_i64(&mut self) -> i64; - fn get_u64(&mut self) -> u64; - fn get_f64(&mut self) -> f64; + fn get_char(&mut self) -> Result; + fn get_i64(&mut self) -> Result; + fn get_u64(&mut self) -> Result; + fn get_f64(&mut self) -> Result; fn get_str(&mut self) -> &'a OsStr; } impl<'a, T: Iterator> ArgumentIter<'a> for T { - fn get_char(&mut self) -> u8 { + fn get_char(&mut self) -> Result { let Some(next) = self.next() else { - return b'\0'; + return Ok(b'\0'); }; match next { - FormatArgument::Char(c) => *c as u8, - FormatArgument::Unparsed(os) => match bytes_from_os_str(os).unwrap().first() { - Some(&byte) => byte, - None => b'\0', + FormatArgument::Char(c) => Ok(*c as u8), + FormatArgument::Unparsed(os) => match try_get_bytes_from_os_str(os)?.first() { + Some(&byte) => Ok(byte), + None => Ok(b'\0'), }, - _ => b'\0', + _ => Ok(b'\0'), } } - fn get_u64(&mut self) -> u64 { + fn get_u64(&mut self) -> Result { let Some(next) = self.next() else { - return 0; + return Ok(0); }; match next { - FormatArgument::UnsignedInt(n) => *n, + FormatArgument::UnsignedInt(n) => Ok(*n), FormatArgument::Unparsed(os) => { - let str = get_str_or_exit_with_error(os); + let str = try_get_str_from_os_str(os)?; - extract_value(ParsedNumber::parse_u64(str), str) + Ok(extract_value(ParsedNumber::parse_u64(str), str)) } - _ => 0, + _ => Ok(0), } } - fn get_i64(&mut self) -> i64 { + fn get_i64(&mut self) -> Result { let Some(next) = self.next() else { - return 0; + return Ok(0); }; match next { - FormatArgument::SignedInt(n) => *n, + FormatArgument::SignedInt(n) => Ok(*n), FormatArgument::Unparsed(os) => { - let str = get_str_or_exit_with_error(os); + let str = try_get_str_from_os_str(os)?; - extract_value(ParsedNumber::parse_i64(str), str) + Ok(extract_value(ParsedNumber::parse_i64(str), str)) } - _ => 0, + _ => Ok(0), } } - fn get_f64(&mut self) -> f64 { + fn get_f64(&mut self) -> Result { let Some(next) = self.next() else { - return 0.0; + return Ok(0.0); }; match next { - FormatArgument::Float(n) => *n, + FormatArgument::Float(n) => Ok(*n), FormatArgument::Unparsed(os) => { - let str = get_str_or_exit_with_error(os); + let str = try_get_str_from_os_str(os)?; - extract_value(ParsedNumber::parse_f64(str), str) + Ok(extract_value(ParsedNumber::parse_f64(str), str)) } - _ => 0.0, + _ => Ok(0.0), } } @@ -135,6 +140,7 @@ fn extract_value(p: Result>, input: &str) -> T } else { show_error!("{}: value not completely converted", input.quote()); } + v } } @@ -142,7 +148,33 @@ fn extract_value(p: Result>, input: &str) -> T } } -pub fn bytes_from_os_str(input: &OsStr) -> UResult<&[u8]> { +#[derive(Debug)] +pub struct NonUtf8OsStr(pub String); + +impl Display for NonUtf8OsStr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_fmt(format_args!( + "invalid (non-UTF-8) string like {} encountered", + self.0.quote(), + )) + } +} + +impl Error for NonUtf8OsStr {} +impl UError for NonUtf8OsStr {} + +pub fn try_get_str_from_os_str(os_str: &OsStr) -> Result<&str, NonUtf8OsStr> { + match os_str.to_str() { + Some(st) => Ok(st), + None => { + let cow = os_str.to_string_lossy(); + + Err(NonUtf8OsStr(cow.to_string())) + } + } +} + +pub fn try_get_bytes_from_os_str(input: &OsStr) -> Result<&[u8], NonUtf8OsStr> { let result = { #[cfg(target_family = "unix")] { @@ -153,38 +185,18 @@ pub fn bytes_from_os_str(input: &OsStr) -> UResult<&[u8]> { #[cfg(not(target_family = "unix"))] { - use crate::error::USimpleError; - // TODO // Verify that this works correctly on these platforms match input.to_str().map(|st| st.as_bytes()) { Some(sl) => Ok(sl), - None => Err(USimpleError::new( - 1, - "non-UTF-8 string encountered when not allowed", - )), + None => { + let cow = input.to_string_lossy(); + + Err(NonUtf8OsStr(cow.to_string())) + } } } }; result } - -fn get_str_or_exit_with_error(os_str: &OsStr) -> &str { - match os_str.to_str() { - Some(st) => st, - None => { - let cow = os_str.to_string_lossy(); - - let quoted = cow.quote(); - - let error = format!( - "argument like {quoted} is not a valid UTF-8 string, and could not be parsed as an integer", - ); - - show!(USimpleError::new(1, error.clone())); - - panic!("{error}"); - } - } -} diff --git a/src/uucore/src/lib/features/format/mod.rs b/src/uucore/src/lib/features/format/mod.rs index 6d7a2ee3079..2bc6db16bfe 100644 --- a/src/uucore/src/lib/features/format/mod.rs +++ b/src/uucore/src/lib/features/format/mod.rs @@ -38,6 +38,7 @@ pub mod num_parser; mod spec; pub use argument::*; +use os_display::Quotable; use spec::Spec; use std::{ error::Error, @@ -63,6 +64,7 @@ pub enum FormatError { NeedAtLeastOneSpec(Vec), WrongSpecType, InvalidPrecision(String), + InvalidEncoding(NonUtf8OsStr), } impl Error for FormatError {} @@ -74,6 +76,12 @@ impl From for FormatError { } } +impl From for FormatError { + fn from(value: NonUtf8OsStr) -> FormatError { + FormatError::InvalidEncoding(value) + } +} + impl Display for FormatError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -98,6 +106,13 @@ impl Display for FormatError { Self::IoError(_) => write!(f, "io error"), Self::NoMoreArguments => write!(f, "no more arguments"), Self::InvalidArgument(_) => write!(f, "invalid argument"), + Self::InvalidEncoding(no) => { + write!( + f, + "invalid (non-UTF-8) argument like {} encountered", + no.0.quote() + ) + } } } } diff --git a/src/uucore/src/lib/features/format/spec.rs b/src/uucore/src/lib/features/format/spec.rs index e3b88fd68d4..fa8ab2ac196 100644 --- a/src/uucore/src/lib/features/format/spec.rs +++ b/src/uucore/src/lib/features/format/spec.rs @@ -6,12 +6,11 @@ // spell-checker:ignore (vars) intmax ptrdiff padlen use super::{ - bytes_from_os_str, num_format::{ self, Case, FloatVariant, ForceDecimal, Formatter, NumberAlignment, PositiveSign, Prefix, UnsignedIntVariant, }, - parse_escape_only, ArgumentIter, FormatChar, FormatError, + parse_escape_only, try_get_bytes_from_os_str, ArgumentIter, FormatChar, FormatError, }; use crate::quoting_style::{escape_name, QuotingStyle}; use std::{io::Write, ops::ControlFlow}; @@ -315,7 +314,7 @@ impl Spec { match self { Self::Char { width, align_left } => { let width = resolve_asterisk(*width, &mut args)?.unwrap_or(0); - write_padded(writer, &[args.get_char()], width, *align_left) + write_padded(writer, &[args.get_char()?], width, *align_left) } Self::String { width, @@ -334,7 +333,7 @@ impl Spec { let os_str = args.get_str(); - let bytes = bytes_from_os_str(os_str).unwrap(); + let bytes = try_get_bytes_from_os_str(os_str).unwrap(); let truncated = match precision { Some(p) if p < os_str.len() => &bytes[..p], @@ -346,7 +345,7 @@ impl Spec { Self::EscapedString => { let os_str = args.get_str(); - let bytes = bytes_from_os_str(os_str).unwrap(); + let bytes = try_get_bytes_from_os_str(os_str).unwrap(); let mut parsed = Vec::::new(); @@ -386,7 +385,7 @@ impl Spec { } => { let width = resolve_asterisk(*width, &mut args)?.unwrap_or(0); let precision = resolve_asterisk(*precision, &mut args)?.unwrap_or(0); - let i = args.get_i64(); + let i = args.get_i64()?; if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -409,7 +408,7 @@ impl Spec { } => { let width = resolve_asterisk(*width, &mut args)?.unwrap_or(0); let precision = resolve_asterisk(*precision, &mut args)?.unwrap_or(0); - let i = args.get_u64(); + let i = args.get_u64()?; if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -435,7 +434,7 @@ impl Spec { } => { let width = resolve_asterisk(*width, &mut args)?.unwrap_or(0); let precision = resolve_asterisk(*precision, &mut args)?.unwrap_or(6); - let f = args.get_f64(); + let f = args.get_f64()?; if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -463,7 +462,7 @@ fn resolve_asterisk<'a>( ) -> Result, FormatError> { Ok(match option { None => None, - Some(CanAsterisk::Asterisk) => Some(usize::try_from(args.get_u64()).ok().unwrap_or(0)), + Some(CanAsterisk::Asterisk) => Some(usize::try_from(args.get_u64()?).ok().unwrap_or(0)), Some(CanAsterisk::Fixed(w)) => Some(w), }) } diff --git a/tests/by-util/test_printf.rs b/tests/by-util/test_printf.rs index 41a247dc9d2..be1a10093f1 100644 --- a/tests/by-util/test_printf.rs +++ b/tests/by-util/test_printf.rs @@ -941,4 +941,11 @@ fn non_utf_8_input() { .arg(os_str) .succeeds() .stdout_only_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg("%d") + .arg(os_str) + .fails() + .stderr_only("printf: invalid (non-UTF-8) argument like 'Swer an rehte g�ete wendet s�n gem�ete, dem volget s�lde und �re.' encountered +"); } From 38a8625a10a5d9a23f1cdc7cda0b0f1b241a0605 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Tue, 22 Oct 2024 16:13:56 -0500 Subject: [PATCH 3/4] Address spelling CI issue --- tests/by-util/test_printf.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/by-util/test_printf.rs b/tests/by-util/test_printf.rs index be1a10093f1..da35501f03e 100644 --- a/tests/by-util/test_printf.rs +++ b/tests/by-util/test_printf.rs @@ -946,6 +946,8 @@ fn non_utf_8_input() { .arg("%d") .arg(os_str) .fails() + // spell-checker:disable .stderr_only("printf: invalid (non-UTF-8) argument like 'Swer an rehte g�ete wendet s�n gem�ete, dem volget s�lde und �re.' encountered "); + // spell-checker:enable } From f2d050ee84ca4694b80793241621e48c16c38c87 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Thu, 24 Oct 2024 12:06:47 -0500 Subject: [PATCH 4/4] Restructure non-UTF-8 input error handling --- src/uu/echo/Cargo.toml | 2 +- src/uu/echo/src/echo.rs | 5 +- src/uu/printf/src/printf.rs | 8 +- .../src/lib/features/format/argument.rs | 70 +--------- src/uucore/src/lib/features/format/mod.rs | 39 +++--- src/uucore/src/lib/features/format/spec.rs | 12 +- src/uucore/src/lib/lib.rs | 125 +++++++++++++++--- 7 files changed, 147 insertions(+), 114 deletions(-) diff --git a/src/uu/echo/Cargo.toml b/src/uu/echo/Cargo.toml index 7b2713ad795..faabc121df1 100644 --- a/src/uu/echo/Cargo.toml +++ b/src/uu/echo/Cargo.toml @@ -18,7 +18,7 @@ path = "src/echo.rs" [dependencies] clap = { workspace = true } -uucore = { workspace = true, features = ["format"] } +uucore = { workspace = true } [[bin]] name = "echo" diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index 4ed003854f7..1e0b5e3de51 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -12,7 +12,7 @@ use std::iter::Peekable; use std::ops::ControlFlow; use std::slice::Iter; use uucore::error::UResult; -use uucore::{format_usage, help_about, help_section, help_usage}; +use uucore::{format_usage, help_about, help_section, help_usage, os_str_as_bytes_verbose}; const ABOUT: &str = help_about!("echo.md"); const USAGE: &str = help_usage!("echo.md"); @@ -355,8 +355,9 @@ fn execute( arguments_after_options: ValuesRef<'_, OsString>, ) -> UResult<()> { for (i, input) in arguments_after_options.enumerate() { - let bytes = uucore::format::try_get_bytes_from_os_str(input)?; + let bytes = os_str_as_bytes_verbose(input)?; + // Don't print a space before the first argument if i > 0 { stdout_lock.write_all(b" ")?; } diff --git a/src/uu/printf/src/printf.rs b/src/uu/printf/src/printf.rs index eac02ae6b86..5bf8853a915 100644 --- a/src/uu/printf/src/printf.rs +++ b/src/uu/printf/src/printf.rs @@ -11,10 +11,8 @@ use std::ffi::OsString; use std::io::stdout; use std::ops::ControlFlow; use uucore::error::{UResult, UUsageError}; -use uucore::format::{ - parse_spec_and_escape, try_get_bytes_from_os_str, FormatArgument, FormatItem, -}; -use uucore::{format_usage, help_about, help_section, help_usage}; +use uucore::format::{parse_spec_and_escape, FormatArgument, FormatError, FormatItem}; +use uucore::{format_usage, help_about, help_section, help_usage, os_str_as_bytes_verbose}; const VERSION: &str = "version"; const HELP: &str = "help"; @@ -35,7 +33,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { .get_one::(options::FORMAT) .ok_or_else(|| UUsageError::new(1, "missing operand"))?; - let format_bytes = try_get_bytes_from_os_str(format)?; + let format_bytes = os_str_as_bytes_verbose(format).map_err(FormatError::from)?; let values = match matches.get_many::(options::ARGUMENT) { Some(os_string) => os_string diff --git a/src/uucore/src/lib/features/format/argument.rs b/src/uucore/src/lib/features/format/argument.rs index 090817acaa2..9811f9ad591 100644 --- a/src/uucore/src/lib/features/format/argument.rs +++ b/src/uucore/src/lib/features/format/argument.rs @@ -5,17 +5,14 @@ use super::FormatError; use crate::{ - error::{set_exit_code, UError}, + error::set_exit_code, features::format::num_parser::{ParseError, ParsedNumber}, + os_str_as_bytes_verbose, os_str_as_str_verbose, quoting_style::{escape_name, Quotes, QuotingStyle}, show_error, show_warning, }; use os_display::Quotable; -use std::{ - error::Error, - ffi::{OsStr, OsString}, - fmt::Display, -}; +use std::ffi::{OsStr, OsString}; /// An argument for formatting /// @@ -50,7 +47,7 @@ impl<'a, T: Iterator> ArgumentIter<'a> for T { }; match next { FormatArgument::Char(c) => Ok(*c as u8), - FormatArgument::Unparsed(os) => match try_get_bytes_from_os_str(os)?.first() { + FormatArgument::Unparsed(os) => match os_str_as_bytes_verbose(os)?.first() { Some(&byte) => Ok(byte), None => Ok(b'\0'), }, @@ -65,7 +62,7 @@ impl<'a, T: Iterator> ArgumentIter<'a> for T { match next { FormatArgument::UnsignedInt(n) => Ok(*n), FormatArgument::Unparsed(os) => { - let str = try_get_str_from_os_str(os)?; + let str = os_str_as_str_verbose(os)?; Ok(extract_value(ParsedNumber::parse_u64(str), str)) } @@ -80,7 +77,7 @@ impl<'a, T: Iterator> ArgumentIter<'a> for T { match next { FormatArgument::SignedInt(n) => Ok(*n), FormatArgument::Unparsed(os) => { - let str = try_get_str_from_os_str(os)?; + let str = os_str_as_str_verbose(os)?; Ok(extract_value(ParsedNumber::parse_i64(str), str)) } @@ -95,7 +92,7 @@ impl<'a, T: Iterator> ArgumentIter<'a> for T { match next { FormatArgument::Float(n) => Ok(*n), FormatArgument::Unparsed(os) => { - let str = try_get_str_from_os_str(os)?; + let str = os_str_as_str_verbose(os)?; Ok(extract_value(ParsedNumber::parse_f64(str), str)) } @@ -147,56 +144,3 @@ fn extract_value(p: Result>, input: &str) -> T } } } - -#[derive(Debug)] -pub struct NonUtf8OsStr(pub String); - -impl Display for NonUtf8OsStr { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_fmt(format_args!( - "invalid (non-UTF-8) string like {} encountered", - self.0.quote(), - )) - } -} - -impl Error for NonUtf8OsStr {} -impl UError for NonUtf8OsStr {} - -pub fn try_get_str_from_os_str(os_str: &OsStr) -> Result<&str, NonUtf8OsStr> { - match os_str.to_str() { - Some(st) => Ok(st), - None => { - let cow = os_str.to_string_lossy(); - - Err(NonUtf8OsStr(cow.to_string())) - } - } -} - -pub fn try_get_bytes_from_os_str(input: &OsStr) -> Result<&[u8], NonUtf8OsStr> { - let result = { - #[cfg(target_family = "unix")] - { - use std::os::unix::ffi::OsStrExt; - - Ok(input.as_bytes()) - } - - #[cfg(not(target_family = "unix"))] - { - // TODO - // Verify that this works correctly on these platforms - match input.to_str().map(|st| st.as_bytes()) { - Some(sl) => Ok(sl), - None => { - let cow = input.to_string_lossy(); - - Err(NonUtf8OsStr(cow.to_string())) - } - } - } - }; - - result -} diff --git a/src/uucore/src/lib/features/format/mod.rs b/src/uucore/src/lib/features/format/mod.rs index 2bc6db16bfe..7933cf8d2a6 100644 --- a/src/uucore/src/lib/features/format/mod.rs +++ b/src/uucore/src/lib/features/format/mod.rs @@ -37,8 +37,13 @@ pub mod num_format; pub mod num_parser; mod spec; -pub use argument::*; -use os_display::Quotable; +pub use argument::FormatArgument; + +use self::{ + escape::{parse_escape_code, EscapedChar}, + num_format::Formatter, +}; +use crate::{error::UError, NonUtf8OsStrError, OsStrConversionType}; use spec::Spec; use std::{ error::Error, @@ -47,13 +52,6 @@ use std::{ ops::ControlFlow, }; -use crate::error::UError; - -use self::{ - escape::{parse_escape_code, EscapedChar}, - num_format::Formatter, -}; - #[derive(Debug)] pub enum FormatError { SpecError(Vec), @@ -64,7 +62,7 @@ pub enum FormatError { NeedAtLeastOneSpec(Vec), WrongSpecType, InvalidPrecision(String), - InvalidEncoding(NonUtf8OsStr), + InvalidEncoding(NonUtf8OsStrError), } impl Error for FormatError {} @@ -76,8 +74,8 @@ impl From for FormatError { } } -impl From for FormatError { - fn from(value: NonUtf8OsStr) -> FormatError { +impl From for FormatError { + fn from(value: NonUtf8OsStrError) -> FormatError { FormatError::InvalidEncoding(value) } } @@ -107,11 +105,18 @@ impl Display for FormatError { Self::NoMoreArguments => write!(f, "no more arguments"), Self::InvalidArgument(_) => write!(f, "invalid argument"), Self::InvalidEncoding(no) => { - write!( - f, - "invalid (non-UTF-8) argument like {} encountered", - no.0.quote() - ) + use os_display::Quotable; + + let quoted = no.input_lossy_string.quote(); + + match no.conversion_type { + OsStrConversionType::ToBytes => f.write_fmt(format_args!( + "invalid (non-UTF-8) argument like {quoted} encountered when converting argument to bytes on a platform that doesn't use UTF-8", + )), + OsStrConversionType::ToString => f.write_fmt(format_args!( + "invalid (non-UTF-8) argument like {quoted} encountered", + )), + } } } } diff --git a/src/uucore/src/lib/features/format/spec.rs b/src/uucore/src/lib/features/format/spec.rs index fa8ab2ac196..89efc55a1e5 100644 --- a/src/uucore/src/lib/features/format/spec.rs +++ b/src/uucore/src/lib/features/format/spec.rs @@ -6,13 +6,17 @@ // spell-checker:ignore (vars) intmax ptrdiff padlen use super::{ + argument::ArgumentIter, num_format::{ self, Case, FloatVariant, ForceDecimal, Formatter, NumberAlignment, PositiveSign, Prefix, UnsignedIntVariant, }, - parse_escape_only, try_get_bytes_from_os_str, ArgumentIter, FormatChar, FormatError, + parse_escape_only, FormatChar, FormatError, +}; +use crate::{ + os_str_as_bytes_verbose, + quoting_style::{escape_name, QuotingStyle}, }; -use crate::quoting_style::{escape_name, QuotingStyle}; use std::{io::Write, ops::ControlFlow}; /// A parsed specification for formatting a value @@ -333,7 +337,7 @@ impl Spec { let os_str = args.get_str(); - let bytes = try_get_bytes_from_os_str(os_str).unwrap(); + let bytes = os_str_as_bytes_verbose(os_str)?; let truncated = match precision { Some(p) if p < os_str.len() => &bytes[..p], @@ -345,7 +349,7 @@ impl Spec { Self::EscapedString => { let os_str = args.get_str(); - let bytes = try_get_bytes_from_os_str(os_str).unwrap(); + let bytes = os_str_as_bytes_verbose(os_str)?; let mut parsed = Vec::::new(); diff --git a/src/uucore/src/lib/lib.rs b/src/uucore/src/lib/lib.rs index 6142e688d7c..2f1c1ddaa2d 100644 --- a/src/uucore/src/lib/lib.rs +++ b/src/uucore/src/lib/lib.rs @@ -253,24 +253,84 @@ pub fn read_yes() -> bool { } } +fn os_str_as_bytes_core(os_string: &OsStr) -> Option<&[u8]> { + // Using a block like this prevents accidental shadowing if the #[cfg] attributes are accidentally not mutually + // exclusive + let option = { + #[cfg(unix)] + { + Some(os_string.as_bytes()) + } + + #[cfg(not(unix))] + { + os_string.to_str().map(|op| op.as_bytes()) + } + }; + + option +} + /// Helper function for processing delimiter values (which could be non UTF-8) /// It converts OsString to &[u8] for unix targets only /// On non-unix (i.e. Windows) it will just return an error if delimiter value is not UTF-8 pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { - #[cfg(unix)] - let bytes = os_string.as_bytes(); - - #[cfg(not(unix))] - let bytes = os_string - .to_str() - .ok_or_else(|| { - mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") - })? - .as_bytes(); + let bytes = os_str_as_bytes_core(os_string).ok_or_else(|| { + mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") + })?; Ok(bytes) } +#[derive(Debug)] +enum OsStrConversionType { + ToBytes, + ToString, +} + +#[derive(Debug)] +pub struct NonUtf8OsStrError { + conversion_type: OsStrConversionType, + input_lossy_string: String, +} + +impl std::fmt::Display for NonUtf8OsStrError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use os_display::Quotable; + + let quoted = self.input_lossy_string.quote(); + + match self.conversion_type { + OsStrConversionType::ToBytes => f.write_fmt(format_args!( + "invalid (non-UTF-8) input like {quoted} encountered when converting input to bytes on a platform that doesn't use UTF-8", + )), + OsStrConversionType::ToString => f.write_fmt(format_args!( + "invalid (non-UTF-8) string like {quoted} encountered", + )), + } + } +} + +impl std::error::Error for NonUtf8OsStrError {} +impl error::UError for NonUtf8OsStrError {} + +pub fn os_str_as_bytes_verbose(input: &OsStr) -> Result<&[u8], NonUtf8OsStrError> { + os_str_as_bytes_core(input).ok_or_else(|| NonUtf8OsStrError { + conversion_type: OsStrConversionType::ToBytes, + input_lossy_string: input.to_string_lossy().into_owned(), + }) +} + +pub fn os_str_as_str_verbose(os_str: &OsStr) -> Result<&str, NonUtf8OsStrError> { + match os_str.to_str() { + Some(st) => Ok(st), + None => Err(NonUtf8OsStrError { + conversion_type: OsStrConversionType::ToString, + input_lossy_string: os_str.to_string_lossy().into_owned(), + }), + } +} + /// Helper function for converting a slice of bytes into an &OsStr /// or OsString in non-unix targets. /// @@ -279,12 +339,21 @@ pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { /// and thus undergo UTF-8 validation, making it fail if the stream contains /// non-UTF-8 characters. pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { - #[cfg(unix)] - let os_str = Cow::Borrowed(OsStr::from_bytes(bytes)); - #[cfg(not(unix))] - let os_str = Cow::Owned(OsString::from(str::from_utf8(bytes).map_err(|_| { - mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr") - })?)); + // Using a block like this prevents accidental shadowing if the #[cfg] attributes are accidentally not mutually + // exclusive + let os_str = { + #[cfg(unix)] + { + Cow::Borrowed(OsStr::from_bytes(bytes)) + } + + #[cfg(not(unix))] + { + Cow::Owned(OsString::from(str::from_utf8(bytes).map_err(|_| { + mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr") + })?)) + } + }; Ok(os_str) } @@ -293,12 +362,24 @@ pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { /// It converts `Vec` to `OsString` for unix targets only. /// On non-unix (i.e. Windows) it may fail if the bytes are not valid UTF-8 pub fn os_string_from_vec(vec: Vec) -> mods::error::UResult { - #[cfg(unix)] - let s = OsString::from_vec(vec); - #[cfg(not(unix))] - let s = OsString::from(String::from_utf8(vec).map_err(|_| { - mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") - })?); + // Using a block like this prevents accidental shadowing if the #[cfg] attributes are accidentally not mutually + // exclusive + let s = { + #[cfg(unix)] + { + OsString::from_vec(vec) + } + + #[cfg(not(unix))] + { + OsString::from(String::from_utf8(vec).map_err(|_| { + mods::error::UUsageError::new( + 1, + "invalid UTF-8 was detected in one or more arguments", + ) + })?) + } + }; Ok(s) }