diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index 746cdd7c59e..1e0b5e3de51 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -6,13 +6,13 @@ use clap::builder::ValueParser; use clap::parser::ValuesRef; use clap::{crate_version, Arg, ArgAction, Command}; -use std::ffi::{OsStr, OsString}; +use std::ffi::OsString; use std::io::{self, StdoutLock, Write}; use std::iter::Peekable; use std::ops::ControlFlow; use std::slice::Iter; -use uucore::error::{UResult, USimpleError}; -use uucore::{format_usage, help_about, help_section, help_usage}; +use uucore::error::UResult; +use uucore::{format_usage, help_about, help_section, help_usage, os_str_as_bytes_verbose}; const ABOUT: &str = help_about!("echo.md"); const USAGE: &str = help_usage!("echo.md"); @@ -355,13 +355,9 @@ fn execute( arguments_after_options: ValuesRef<'_, OsString>, ) -> UResult<()> { for (i, input) in arguments_after_options.enumerate() { - let Some(bytes) = bytes_from_os_string(input.as_os_str()) else { - return Err(USimpleError::new( - 1, - "Non-UTF-8 arguments provided, but this platform does not support them", - )); - }; + let bytes = os_str_as_bytes_verbose(input)?; + // Don't print a space before the first argument if i > 0 { stdout_lock.write_all(b" ")?; } @@ -381,23 +377,3 @@ fn execute( Ok(()) } - -fn bytes_from_os_string(input: &OsStr) -> Option<&[u8]> { - let option = { - #[cfg(target_family = "unix")] - { - use std::os::unix::ffi::OsStrExt; - - Some(input.as_bytes()) - } - - #[cfg(not(target_family = "unix"))] - { - // TODO - // Verify that this works correctly on these platforms - input.to_str().map(|st| st.as_bytes()) - } - }; - - option -} diff --git a/src/uu/printf/src/printf.rs b/src/uu/printf/src/printf.rs index f86b7bd9fa2..5bf8853a915 100644 --- a/src/uu/printf/src/printf.rs +++ b/src/uu/printf/src/printf.rs @@ -5,12 +5,14 @@ #![allow(dead_code)] +use clap::builder::ValueParser; use clap::{crate_version, Arg, ArgAction, Command}; +use std::ffi::OsString; use std::io::stdout; use std::ops::ControlFlow; use uucore::error::{UResult, UUsageError}; -use uucore::format::{parse_spec_and_escape, FormatArgument, FormatItem}; -use uucore::{format_usage, help_about, help_section, help_usage}; +use uucore::format::{parse_spec_and_escape, FormatArgument, FormatError, FormatItem}; +use uucore::{format_usage, help_about, help_section, help_usage, os_str_as_bytes_verbose}; const VERSION: &str = "version"; const HELP: &str = "help"; @@ -28,17 +30,22 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uu_app().get_matches_from(args); let format = matches - .get_one::(options::FORMAT) + .get_one::(options::FORMAT) .ok_or_else(|| UUsageError::new(1, "missing operand"))?; - let values: Vec<_> = match matches.get_many::(options::ARGUMENT) { - Some(s) => s.map(|s| FormatArgument::Unparsed(s.to_string())).collect(), - None => vec![], + let format_bytes = os_str_as_bytes_verbose(format).map_err(FormatError::from)?; + + let values = match matches.get_many::(options::ARGUMENT) { + Some(os_string) => os_string + .map(|os_string_ref| FormatArgument::Unparsed(os_string_ref.to_owned())) + .collect(), + None => Vec::::new(), }; let mut format_seen = false; let mut args = values.iter().peekable(); - for item in parse_spec_and_escape(format.as_ref()) { + + for item in parse_spec_and_escape(format_bytes) { if let Ok(FormatItem::Spec(_)) = item { format_seen = true; } @@ -55,7 +62,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { } while args.peek().is_some() { - for item in parse_spec_and_escape(format.as_ref()) { + for item in parse_spec_and_escape(format_bytes) { match item?.write(stdout(), &mut args)? { ControlFlow::Continue(()) => {} ControlFlow::Break(()) => return Ok(()), @@ -86,6 +93,10 @@ pub fn uu_app() -> Command { .help("Print version information") .action(ArgAction::Version), ) - .arg(Arg::new(options::FORMAT)) - .arg(Arg::new(options::ARGUMENT).action(ArgAction::Append)) + .arg(Arg::new(options::FORMAT).value_parser(ValueParser::os_string())) + .arg( + Arg::new(options::ARGUMENT) + .action(ArgAction::Append) + .value_parser(ValueParser::os_string()), + ) } diff --git a/src/uucore/src/lib/features/format/argument.rs b/src/uucore/src/lib/features/format/argument.rs index 75851049895..9811f9ad591 100644 --- a/src/uucore/src/lib/features/format/argument.rs +++ b/src/uucore/src/lib/features/format/argument.rs @@ -3,14 +3,16 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +use super::FormatError; use crate::{ error::set_exit_code, features::format::num_parser::{ParseError, ParsedNumber}, + os_str_as_bytes_verbose, os_str_as_str_verbose, quoting_style::{escape_name, Quotes, QuotingStyle}, show_error, show_warning, }; use os_display::Quotable; -use std::ffi::OsStr; +use std::ffi::{OsStr, OsString}; /// An argument for formatting /// @@ -22,71 +24,86 @@ use std::ffi::OsStr; #[derive(Clone, Debug)] pub enum FormatArgument { Char(char), - String(String), + String(OsString), UnsignedInt(u64), SignedInt(i64), Float(f64), /// Special argument that gets coerced into the other variants - Unparsed(String), + Unparsed(OsString), } pub trait ArgumentIter<'a>: Iterator { - fn get_char(&mut self) -> u8; - fn get_i64(&mut self) -> i64; - fn get_u64(&mut self) -> u64; - fn get_f64(&mut self) -> f64; - fn get_str(&mut self) -> &'a str; + fn get_char(&mut self) -> Result; + fn get_i64(&mut self) -> Result; + fn get_u64(&mut self) -> Result; + fn get_f64(&mut self) -> Result; + fn get_str(&mut self) -> &'a OsStr; } impl<'a, T: Iterator> ArgumentIter<'a> for T { - fn get_char(&mut self) -> u8 { + fn get_char(&mut self) -> Result { let Some(next) = self.next() else { - return b'\0'; + return Ok(b'\0'); }; match next { - FormatArgument::Char(c) => *c as u8, - FormatArgument::Unparsed(s) => s.bytes().next().unwrap_or(b'\0'), - _ => b'\0', + FormatArgument::Char(c) => Ok(*c as u8), + FormatArgument::Unparsed(os) => match os_str_as_bytes_verbose(os)?.first() { + Some(&byte) => Ok(byte), + None => Ok(b'\0'), + }, + _ => Ok(b'\0'), } } - fn get_u64(&mut self) -> u64 { + fn get_u64(&mut self) -> Result { let Some(next) = self.next() else { - return 0; + return Ok(0); }; match next { - FormatArgument::UnsignedInt(n) => *n, - FormatArgument::Unparsed(s) => extract_value(ParsedNumber::parse_u64(s), s), - _ => 0, + FormatArgument::UnsignedInt(n) => Ok(*n), + FormatArgument::Unparsed(os) => { + let str = os_str_as_str_verbose(os)?; + + Ok(extract_value(ParsedNumber::parse_u64(str), str)) + } + _ => Ok(0), } } - fn get_i64(&mut self) -> i64 { + fn get_i64(&mut self) -> Result { let Some(next) = self.next() else { - return 0; + return Ok(0); }; match next { - FormatArgument::SignedInt(n) => *n, - FormatArgument::Unparsed(s) => extract_value(ParsedNumber::parse_i64(s), s), - _ => 0, + FormatArgument::SignedInt(n) => Ok(*n), + FormatArgument::Unparsed(os) => { + let str = os_str_as_str_verbose(os)?; + + Ok(extract_value(ParsedNumber::parse_i64(str), str)) + } + _ => Ok(0), } } - fn get_f64(&mut self) -> f64 { + fn get_f64(&mut self) -> Result { let Some(next) = self.next() else { - return 0.0; + return Ok(0.0); }; match next { - FormatArgument::Float(n) => *n, - FormatArgument::Unparsed(s) => extract_value(ParsedNumber::parse_f64(s), s), - _ => 0.0, + FormatArgument::Float(n) => Ok(*n), + FormatArgument::Unparsed(os) => { + let str = os_str_as_str_verbose(os)?; + + Ok(extract_value(ParsedNumber::parse_f64(str), str)) + } + _ => Ok(0.0), } } - fn get_str(&mut self) -> &'a str { + fn get_str(&mut self) -> &'a OsStr { match self.next() { - Some(FormatArgument::Unparsed(s) | FormatArgument::String(s)) => s, - _ => "", + Some(FormatArgument::Unparsed(os) | FormatArgument::String(os)) => os, + _ => "".as_ref(), } } } @@ -120,6 +137,7 @@ fn extract_value(p: Result>, input: &str) -> T } else { show_error!("{}: value not completely converted", input.quote()); } + v } } diff --git a/src/uucore/src/lib/features/format/mod.rs b/src/uucore/src/lib/features/format/mod.rs index 6d7a2ee3079..7933cf8d2a6 100644 --- a/src/uucore/src/lib/features/format/mod.rs +++ b/src/uucore/src/lib/features/format/mod.rs @@ -37,7 +37,13 @@ pub mod num_format; pub mod num_parser; mod spec; -pub use argument::*; +pub use argument::FormatArgument; + +use self::{ + escape::{parse_escape_code, EscapedChar}, + num_format::Formatter, +}; +use crate::{error::UError, NonUtf8OsStrError, OsStrConversionType}; use spec::Spec; use std::{ error::Error, @@ -46,13 +52,6 @@ use std::{ ops::ControlFlow, }; -use crate::error::UError; - -use self::{ - escape::{parse_escape_code, EscapedChar}, - num_format::Formatter, -}; - #[derive(Debug)] pub enum FormatError { SpecError(Vec), @@ -63,6 +62,7 @@ pub enum FormatError { NeedAtLeastOneSpec(Vec), WrongSpecType, InvalidPrecision(String), + InvalidEncoding(NonUtf8OsStrError), } impl Error for FormatError {} @@ -74,6 +74,12 @@ impl From for FormatError { } } +impl From for FormatError { + fn from(value: NonUtf8OsStrError) -> FormatError { + FormatError::InvalidEncoding(value) + } +} + impl Display for FormatError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -98,6 +104,20 @@ impl Display for FormatError { Self::IoError(_) => write!(f, "io error"), Self::NoMoreArguments => write!(f, "no more arguments"), Self::InvalidArgument(_) => write!(f, "invalid argument"), + Self::InvalidEncoding(no) => { + use os_display::Quotable; + + let quoted = no.input_lossy_string.quote(); + + match no.conversion_type { + OsStrConversionType::ToBytes => f.write_fmt(format_args!( + "invalid (non-UTF-8) argument like {quoted} encountered when converting argument to bytes on a platform that doesn't use UTF-8", + )), + OsStrConversionType::ToString => f.write_fmt(format_args!( + "invalid (non-UTF-8) argument like {quoted} encountered", + )), + } + } } } } diff --git a/src/uucore/src/lib/features/format/spec.rs b/src/uucore/src/lib/features/format/spec.rs index 581e1fa0624..89efc55a1e5 100644 --- a/src/uucore/src/lib/features/format/spec.rs +++ b/src/uucore/src/lib/features/format/spec.rs @@ -5,14 +5,17 @@ // spell-checker:ignore (vars) intmax ptrdiff padlen -use crate::quoting_style::{escape_name, QuotingStyle}; - use super::{ + argument::ArgumentIter, num_format::{ self, Case, FloatVariant, ForceDecimal, Formatter, NumberAlignment, PositiveSign, Prefix, UnsignedIntVariant, }, - parse_escape_only, ArgumentIter, FormatChar, FormatError, + parse_escape_only, FormatChar, FormatError, +}; +use crate::{ + os_str_as_bytes_verbose, + quoting_style::{escape_name, QuotingStyle}, }; use std::{io::Write, ops::ControlFlow}; @@ -315,7 +318,7 @@ impl Spec { match self { Self::Char { width, align_left } => { let width = resolve_asterisk(*width, &mut args)?.unwrap_or(0); - write_padded(writer, &[args.get_char()], width, *align_left) + write_padded(writer, &[args.get_char()?], width, *align_left) } Self::String { width, @@ -331,17 +334,26 @@ impl Spec { // TODO: We need to not use Rust's formatting for aligning the output, // so that we can just write bytes to stdout without panicking. let precision = resolve_asterisk(*precision, &mut args)?; - let s = args.get_str(); + + let os_str = args.get_str(); + + let bytes = os_str_as_bytes_verbose(os_str)?; + let truncated = match precision { - Some(p) if p < s.len() => &s[..p], - _ => s, + Some(p) if p < os_str.len() => &bytes[..p], + _ => bytes, }; - write_padded(writer, truncated.as_bytes(), width, *align_left) + + write_padded(writer, truncated, width, *align_left) } Self::EscapedString => { - let s = args.get_str(); - let mut parsed = Vec::new(); - for c in parse_escape_only(s.as_bytes()) { + let os_str = args.get_str(); + + let bytes = os_str_as_bytes_verbose(os_str)?; + + let mut parsed = Vec::::new(); + + for c in parse_escape_only(bytes) { match c.write(&mut parsed)? { ControlFlow::Continue(()) => {} ControlFlow::Break(()) => { @@ -353,11 +365,12 @@ impl Spec { writer.write_all(&parsed).map_err(FormatError::IoError) } Self::QuotedString => { - let s = args.get_str(); + let os = args.get_str(); + writer .write_all( escape_name( - s.as_ref(), + os, &QuotingStyle::Shell { escape: true, always_quote: false, @@ -376,7 +389,7 @@ impl Spec { } => { let width = resolve_asterisk(*width, &mut args)?.unwrap_or(0); let precision = resolve_asterisk(*precision, &mut args)?.unwrap_or(0); - let i = args.get_i64(); + let i = args.get_i64()?; if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -399,7 +412,7 @@ impl Spec { } => { let width = resolve_asterisk(*width, &mut args)?.unwrap_or(0); let precision = resolve_asterisk(*precision, &mut args)?.unwrap_or(0); - let i = args.get_u64(); + let i = args.get_u64()?; if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -425,7 +438,7 @@ impl Spec { } => { let width = resolve_asterisk(*width, &mut args)?.unwrap_or(0); let precision = resolve_asterisk(*precision, &mut args)?.unwrap_or(6); - let f = args.get_f64(); + let f = args.get_f64()?; if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -453,7 +466,7 @@ fn resolve_asterisk<'a>( ) -> Result, FormatError> { Ok(match option { None => None, - Some(CanAsterisk::Asterisk) => Some(usize::try_from(args.get_u64()).ok().unwrap_or(0)), + Some(CanAsterisk::Asterisk) => Some(usize::try_from(args.get_u64()?).ok().unwrap_or(0)), Some(CanAsterisk::Fixed(w)) => Some(w), }) } diff --git a/src/uucore/src/lib/lib.rs b/src/uucore/src/lib/lib.rs index 6142e688d7c..2f1c1ddaa2d 100644 --- a/src/uucore/src/lib/lib.rs +++ b/src/uucore/src/lib/lib.rs @@ -253,24 +253,84 @@ pub fn read_yes() -> bool { } } +fn os_str_as_bytes_core(os_string: &OsStr) -> Option<&[u8]> { + // Using a block like this prevents accidental shadowing if the #[cfg] attributes are accidentally not mutually + // exclusive + let option = { + #[cfg(unix)] + { + Some(os_string.as_bytes()) + } + + #[cfg(not(unix))] + { + os_string.to_str().map(|op| op.as_bytes()) + } + }; + + option +} + /// Helper function for processing delimiter values (which could be non UTF-8) /// It converts OsString to &[u8] for unix targets only /// On non-unix (i.e. Windows) it will just return an error if delimiter value is not UTF-8 pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { - #[cfg(unix)] - let bytes = os_string.as_bytes(); - - #[cfg(not(unix))] - let bytes = os_string - .to_str() - .ok_or_else(|| { - mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") - })? - .as_bytes(); + let bytes = os_str_as_bytes_core(os_string).ok_or_else(|| { + mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") + })?; Ok(bytes) } +#[derive(Debug)] +enum OsStrConversionType { + ToBytes, + ToString, +} + +#[derive(Debug)] +pub struct NonUtf8OsStrError { + conversion_type: OsStrConversionType, + input_lossy_string: String, +} + +impl std::fmt::Display for NonUtf8OsStrError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use os_display::Quotable; + + let quoted = self.input_lossy_string.quote(); + + match self.conversion_type { + OsStrConversionType::ToBytes => f.write_fmt(format_args!( + "invalid (non-UTF-8) input like {quoted} encountered when converting input to bytes on a platform that doesn't use UTF-8", + )), + OsStrConversionType::ToString => f.write_fmt(format_args!( + "invalid (non-UTF-8) string like {quoted} encountered", + )), + } + } +} + +impl std::error::Error for NonUtf8OsStrError {} +impl error::UError for NonUtf8OsStrError {} + +pub fn os_str_as_bytes_verbose(input: &OsStr) -> Result<&[u8], NonUtf8OsStrError> { + os_str_as_bytes_core(input).ok_or_else(|| NonUtf8OsStrError { + conversion_type: OsStrConversionType::ToBytes, + input_lossy_string: input.to_string_lossy().into_owned(), + }) +} + +pub fn os_str_as_str_verbose(os_str: &OsStr) -> Result<&str, NonUtf8OsStrError> { + match os_str.to_str() { + Some(st) => Ok(st), + None => Err(NonUtf8OsStrError { + conversion_type: OsStrConversionType::ToString, + input_lossy_string: os_str.to_string_lossy().into_owned(), + }), + } +} + /// Helper function for converting a slice of bytes into an &OsStr /// or OsString in non-unix targets. /// @@ -279,12 +339,21 @@ pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { /// and thus undergo UTF-8 validation, making it fail if the stream contains /// non-UTF-8 characters. pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { - #[cfg(unix)] - let os_str = Cow::Borrowed(OsStr::from_bytes(bytes)); - #[cfg(not(unix))] - let os_str = Cow::Owned(OsString::from(str::from_utf8(bytes).map_err(|_| { - mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr") - })?)); + // Using a block like this prevents accidental shadowing if the #[cfg] attributes are accidentally not mutually + // exclusive + let os_str = { + #[cfg(unix)] + { + Cow::Borrowed(OsStr::from_bytes(bytes)) + } + + #[cfg(not(unix))] + { + Cow::Owned(OsString::from(str::from_utf8(bytes).map_err(|_| { + mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr") + })?)) + } + }; Ok(os_str) } @@ -293,12 +362,24 @@ pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { /// It converts `Vec` to `OsString` for unix targets only. /// On non-unix (i.e. Windows) it may fail if the bytes are not valid UTF-8 pub fn os_string_from_vec(vec: Vec) -> mods::error::UResult { - #[cfg(unix)] - let s = OsString::from_vec(vec); - #[cfg(not(unix))] - let s = OsString::from(String::from_utf8(vec).map_err(|_| { - mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") - })?); + // Using a block like this prevents accidental shadowing if the #[cfg] attributes are accidentally not mutually + // exclusive + let s = { + #[cfg(unix)] + { + OsString::from_vec(vec) + } + + #[cfg(not(unix))] + { + OsString::from(String::from_utf8(vec).map_err(|_| { + mods::error::UUsageError::new( + 1, + "invalid UTF-8 was detected in one or more arguments", + ) + })?) + } + }; Ok(s) } diff --git a/tests/by-util/test_printf.rs b/tests/by-util/test_printf.rs index 29a8cc9140f..da35501f03e 100644 --- a/tests/by-util/test_printf.rs +++ b/tests/by-util/test_printf.rs @@ -916,3 +916,38 @@ fn float_flag_position_space_padding() { .succeeds() .stdout_only(" +1.0"); } + +#[test] +#[cfg(target_family = "unix")] +fn non_utf_8_input() { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + // ISO-8859-1 encoded text + // spell-checker:disable + const INPUT_AND_OUTPUT: &[u8] = + b"Swer an rehte g\xFCete wendet s\xEEn gem\xFCete, dem volget s\xE6lde und \xEAre."; + // spell-checker:enable + + let os_str = OsStr::from_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg("%s") + .arg(os_str) + .succeeds() + .stdout_only_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg(os_str) + .succeeds() + .stdout_only_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg("%d") + .arg(os_str) + .fails() + // spell-checker:disable + .stderr_only("printf: invalid (non-UTF-8) argument like 'Swer an rehte g�ete wendet s�n gem�ete, dem volget s�lde und �re.' encountered +"); + // spell-checker:enable +}