diff --git a/.vscode/cspell.dictionaries/shell.wordlist.txt b/.vscode/cspell.dictionaries/shell.wordlist.txt index 95dea94a7cd..11ce341addf 100644 --- a/.vscode/cspell.dictionaries/shell.wordlist.txt +++ b/.vscode/cspell.dictionaries/shell.wordlist.txt @@ -25,6 +25,7 @@ sudoedit tcsh tzselect urandom +VARNAME wtmp zsh diff --git a/Cargo.lock b/Cargo.lock index 978c7fc3fae..616fec68ded 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -541,7 +541,7 @@ dependencies = [ "lazy_static", "proc-macro2", "regex", - "syn 2.0.23", + "syn 2.0.32", "unicode-xid", ] @@ -553,7 +553,7 @@ checksum = "3e1a2532e4ed4ea13031c13bc7bc0dbca4aae32df48e9d77f0d1e743179f2ea1" dependencies = [ "lazy_static", "proc-macro2", - "syn 2.0.23", + "syn 2.0.32", ] [[package]] @@ -568,7 +568,7 @@ dependencies = [ "lazy_static", "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.32", ] [[package]] @@ -936,7 +936,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.32", ] [[package]] @@ -1778,7 +1778,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.23", + "syn 2.0.32", "unicode-ident", ] @@ -1889,9 +1889,23 @@ checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" [[package]] name = "serde" -version = "1.0.147" +version = "1.0.193" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965" +checksum = "25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.193" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.32", +] [[package]] name = "sha1" @@ -1927,9 +1941,9 @@ dependencies = [ [[package]] name = "shlex" -version = "1.1.0" 
+version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" +checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" [[package]] name = "signal-hook" @@ -2007,6 +2021,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.10.0" @@ -2026,9 +2046,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.23" +version = "2.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737" +checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" dependencies = [ "proc-macro2", "quote", @@ -2419,8 +2439,10 @@ name = "uu_env" version = "0.0.23" dependencies = [ "clap", + "memchr", "nix", "rust-ini", + "static_assertions", "uucore", ] @@ -3283,7 +3305,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.32", "wasm-bindgen-shared", ] @@ -3305,7 +3327,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.23", + "syn 2.0.32", "wasm-bindgen-backend", "wasm-bindgen-shared", ] diff --git a/src/uu/env/Cargo.toml b/src/uu/env/Cargo.toml index a1df3563b20..78452ca6ade 100644 --- a/src/uu/env/Cargo.toml +++ b/src/uu/env/Cargo.toml @@ -18,6 +18,8 @@ path = "src/env.rs" clap = { workspace = true } rust-ini = { workspace = true } uucore = { workspace = true, features = ["signals"] } +memchr = "2.7.1" +static_assertions = "1.1.0" [target.'cfg(unix)'.dependencies] nix = { workspace = true, features = ["signal"] } diff --git a/src/uu/env/src/env.rs b/src/uu/env/src/env.rs index 608357f5050..4596ad30d00 100644 
--- a/src/uu/env/src/env.rs +++ b/src/uu/env/src/env.rs @@ -5,19 +5,27 @@ // spell-checker:ignore (ToDO) chdir execvp progname subcommand subcommands unsets setenv putenv spawnp SIGSEGV SIGBUS sigaction +pub mod parse_error; +pub mod raw_string_parser; +pub mod split_iterator; + use clap::{crate_name, crate_version, Arg, ArgAction, Command}; use ini::Ini; #[cfg(unix)] use nix::sys::signal::{raise, sigaction, SaFlags, SigAction, SigHandler, SigSet, Signal}; use std::borrow::Cow; use std::env; +use std::ffi::OsString; use std::io::{self, Write}; use std::iter::Iterator; +use std::ops::Deref; +#[cfg(unix)] +use std::os::unix::ffi::OsStrExt; #[cfg(unix)] use std::os::unix::process::ExitStatusExt; -use std::process; +use std::process::{self}; use uucore::display::Quotable; -use uucore::error::{UClapError, UResult, USimpleError, UUsageError}; +use uucore::error::{ExitCode, UError, UResult, USimpleError, UUsageError}; use uucore::line_ending::LineEnding; use uucore::{format_usage, help_about, help_section, help_usage, show_warning}; @@ -25,6 +33,8 @@ const ABOUT: &str = help_about!("env.md"); const USAGE: &str = help_usage!("env.md"); const AFTER_HELP: &str = help_section!("after help", "env.md"); +const ERROR_MSG_S_SHEBANG: &str = "use -[v]S to pass options in shebang lines"; + struct Options<'a> { ignore_env: bool, line_ending: LineEnding, @@ -97,23 +107,11 @@ fn load_config_file(opts: &mut Options) -> UResult<()> { Ok(()) } -#[cfg(not(windows))] -#[allow(clippy::ptr_arg)] -fn build_command<'a, 'b>(args: &'a Vec<&'b str>) -> (Cow<'b, str>, &'a [&'b str]) { +fn build_command<'a, 'b>(args: &'a [&'b str]) -> (Cow<'b, str>, &'a [&'b str]) { let progname = Cow::from(args[0]); (progname, &args[1..]) } -#[cfg(windows)] -fn build_command<'a, 'b>(args: &'a mut Vec<&'b str>) -> (Cow<'b, str>, &'a [&'b str]) { - args.insert(0, "/d/c"); - let progname = env::var("ComSpec") - .map(Cow::from) - .unwrap_or_else(|_| Cow::from("cmd")); - - (progname, &args[..]) -} - pub fn uu_app() -> 
Command { Command::new(crate_name!()) .version(crate_version!()) @@ -168,191 +166,362 @@ pub fn uu_app() -> Command { .action(ArgAction::Append) .help("remove variable from the environment"), ) + .arg( + Arg::new("debug") + .short('v') + .long("debug") + .action(ArgAction::SetTrue) + .help("print verbose information for each processing step"), + ) + .arg( + Arg::new("split-string") // split string handling is implemented directly, not using CLAP. But this entry here is needed for the help information output. + .short('S') + .long("split-string") + .value_name("S") + .action(ArgAction::Set) + .help("process and split S into separate arguments; used to pass multiple arguments on shebang lines") + ) .arg(Arg::new("vars").action(ArgAction::Append)) } -#[allow(clippy::cognitive_complexity)] -fn run_env(args: impl uucore::Args) -> UResult<()> { - let app = uu_app(); - let matches = app.try_get_matches_from(args).with_exit_code(125)?; - - let ignore_env = matches.get_flag("ignore-environment"); - let line_ending = LineEnding::from_zero_flag(matches.get_flag("null")); - let running_directory = matches.get_one::("chdir").map(|s| s.as_str()); - let files = match matches.get_many::("file") { - Some(v) => v.map(|s| s.as_str()).collect(), - None => Vec::with_capacity(0), - }; - let unsets = match matches.get_many::("unset") { - Some(v) => v.map(|s| s.as_str()).collect(), - None => Vec::with_capacity(0), - }; - - let mut opts = Options { - ignore_env, - line_ending, - running_directory, - files, - unsets, - sets: vec![], - program: vec![], - }; - - // change directory - if let Some(d) = opts.running_directory { - match env::set_current_dir(d) { - Ok(()) => d, - Err(error) => { - return Err(USimpleError::new( - 125, - format!("cannot change directory to \"{d}\": {error}"), - )); - } - }; +pub fn parse_args_from_str(text: &str) -> UResult> { + split_iterator::split(text).map_err(|e| match e { + parse_error::ParseError::BackslashCNotAllowedInDoubleQuotes { pos: _ } => { + 
USimpleError::new(125, "'\\c' must not appear in double-quoted -S string") + } + parse_error::ParseError::InvalidBackslashAtEndOfStringInMinusS { pos: _, quoting: _ } => { + USimpleError::new(125, "invalid backslash at end of string in -S") + } + parse_error::ParseError::InvalidSequenceBackslashXInMinusS { pos: _, c } => { + USimpleError::new(125, format!("invalid sequence '\\{}' in -S", c)) + } + parse_error::ParseError::MissingClosingQuote { pos: _, c: _ } => { + USimpleError::new(125, "no terminating quote in -S string") + } + parse_error::ParseError::ParsingOfVariableNameFailed { pos, msg } => { + USimpleError::new(125, format!("variable name issue (at {}): {}", pos, msg,)) + } + _ => USimpleError::new(125, format!("Error: {:?}", e)), + }) +} + +fn check_and_handle_string_args( + arg: &OsString, + prefix_to_test: &str, + all_args: &mut Vec, + do_debug_print_args: Option<&Vec>, +) -> UResult { + let arg_bytes; + #[cfg(unix)] + { + arg_bytes = arg.as_bytes(); + } + #[cfg(not(unix))] + { + arg_bytes = arg + .to_str() + .ok_or_else(|| USimpleError::new(1, "parameters contain invalid utf8"))? 
+ .as_bytes(); } - let mut begin_prog_opts = false; - if let Some(mut iter) = matches.get_many::("vars") { - // read NAME=VALUE arguments (and up to a single program argument) - while !begin_prog_opts { - if let Some(opt) = iter.next() { - if opt == "-" { - opts.ignore_env = true; - } else { - begin_prog_opts = parse_name_value_opt(&mut opts, opt)?; + if !arg_bytes.starts_with(prefix_to_test.as_bytes()) { + return Ok(false); + } + + if let Some(input_args) = do_debug_print_args { + debug_print_args(input_args); // do it here, such that its also printed when we get an error/panic during parsing + } + + let remaining_bytes = arg_bytes.get(prefix_to_test.len()..).unwrap(); + let string = String::from_utf8(remaining_bytes.to_owned()).unwrap(); + + let arg_strings = parse_args_from_str(string.as_str())?; + all_args.extend(arg_strings.into_iter().map(OsString::from)); + + Ok(true) +} + +#[derive(Default)] +struct EnvAppData { + do_debug_printing: bool, + had_string_argument: bool, +} + +impl EnvAppData { + fn make_error_no_such_file_or_dir(&self, prog: &str) -> Box { + uucore::show_error!("'{}': No such file or directory", prog); + if !self.had_string_argument { + uucore::show_error!("{}", ERROR_MSG_S_SHEBANG); + } + ExitCode::new(127) + } + + fn process_all_string_arguments( + &mut self, + original_args: &Vec, + ) -> UResult> { + let mut all_args: Vec = Vec::new(); + for arg in original_args { + match arg { + b if check_and_handle_string_args(b, "--split-string", &mut all_args, None)? => { + self.had_string_argument = true; + } + b if check_and_handle_string_args(b, "-S", &mut all_args, None)? => { + self.had_string_argument = true; + } + b if check_and_handle_string_args( + b, + "-vS", + &mut all_args, + Some(original_args), + )? 
=> + { + self.do_debug_printing = true; + self.had_string_argument = true; + } + _ => { + all_args.push(arg.clone()); } - } else { - break; } } - // read any leftover program arguments - for opt in iter { - parse_program_opt(&mut opts, opt)?; - } + Ok(all_args) } +} - // GNU env tests this behavior - if opts.program.is_empty() && running_directory.is_some() { - return Err(UUsageError::new( - 125, - "must specify command with --chdir (-C)".to_string(), - )); +fn debug_print_args(args: &[OsString]) { + eprintln!("input args:"); + for (i, arg) in args.iter().enumerate() { + eprintln!("arg[{}]: {}", i, arg.to_string_lossy()); } +} - // NOTE: we manually set and unset the env vars below rather than using Command::env() to more - // easily handle the case where no command is given +impl EnvAppData { + #[allow(clippy::cognitive_complexity)] + fn run_env(&mut self, original_args: impl uucore::Args) -> UResult<()> { + let original_args: Vec = original_args.collect(); + let args = self.process_all_string_arguments(&original_args)?; + + let app = uu_app(); + let matches = app + .try_get_matches_from(args) + .map_err(|e| -> Box { + match e.kind() { + clap::error::ErrorKind::DisplayHelp + | clap::error::ErrorKind::DisplayVersion => e.into(), + _ => { + // extent any real issue with parameter parsing by the ERROR_MSG_S_SHEBANG + let s = format!("{}", e); + if !s.is_empty() { + let s = s.trim_end(); + uucore::show_error!("{}", s); + } + uucore::show_error!("{}", ERROR_MSG_S_SHEBANG); + uucore::error::ExitCode::new(125) + } + } + })?; - // remove all env vars if told to ignore presets - if opts.ignore_env { - for (ref name, _) in env::vars() { - env::remove_var(name); + let did_debug_printing_before = self.do_debug_printing; // could have been done already as part of the "-vS" string parsing + let do_debug_printing = self.do_debug_printing || matches.get_flag("debug"); + if do_debug_printing && !did_debug_printing_before { + debug_print_args(&original_args); } - } - // load 
.env-style config file prior to those given on the command-line - load_config_file(&mut opts)?; + let ignore_env = matches.get_flag("ignore-environment"); + let line_ending = LineEnding::from_zero_flag(matches.get_flag("null")); + let running_directory = matches.get_one::("chdir").map(|s| s.as_str()); + let files = match matches.get_many::("file") { + Some(v) => v.map(|s| s.as_str()).collect(), + None => Vec::with_capacity(0), + }; + let unsets = match matches.get_many::("unset") { + Some(v) => v.map(|s| s.as_str()).collect(), + None => Vec::with_capacity(0), + }; + + let mut opts = Options { + ignore_env, + line_ending, + running_directory, + files, + unsets, + sets: vec![], + program: vec![], + }; + + // change directory + if let Some(d) = opts.running_directory { + match env::set_current_dir(d) { + Ok(()) => d, + Err(error) => { + return Err(USimpleError::new( + 125, + format!("cannot change directory to \"{d}\": {error}"), + )); + } + }; + } + + let mut begin_prog_opts = false; + if let Some(mut iter) = matches.get_many::("vars") { + // read NAME=VALUE arguments (and up to a single program argument) + while !begin_prog_opts { + if let Some(opt) = iter.next() { + if opt == "-" { + opts.ignore_env = true; + } else { + begin_prog_opts = parse_name_value_opt(&mut opts, opt)?; + } + } else { + break; + } + } + + // read any leftover program arguments + for opt in iter { + parse_program_opt(&mut opts, opt)?; + } + } - // unset specified env vars - for name in &opts.unsets { - if name.is_empty() || name.contains(0 as char) || name.contains('=') { - return Err(USimpleError::new( + // GNU env tests this behavior + if opts.program.is_empty() && running_directory.is_some() { + return Err(UUsageError::new( 125, - format!("cannot unset {}: Invalid argument", name.quote()), + "must specify command with --chdir (-C)".to_string(), )); } - env::remove_var(name); - } + // NOTE: we manually set and unset the env vars below rather than using Command::env() to more + // easily 
handle the case where no command is given - // set specified env vars - for &(name, val) in &opts.sets { - /* - * set_var panics if name is an empty string - * set_var internally calls setenv (on unix at least), while GNU env calls putenv instead. - * - * putenv returns successfully if provided with something like "=a" and modifies the environ - * variable to contain "=a" inside it, effectively modifying the process' current environment - * to contain a malformed string in it. Using GNU's implementation, the command `env =a` - * prints out the malformed string and even invokes the child process with that environment. - * This can be seen by using `env -i =a env` or `env -i =a cat /proc/self/environ` - * - * POSIX.1-2017 doesn't seem to mention what to do if the string is malformed (at least - * not in "Chapter 8, Environment Variables" or in the definition for environ and various - * exec*'s or in the description of env in the "Shell & Utilities" volume). - * - * It also doesn't specify any checks for putenv before modifying the environ variable, which - * is likely why glibc doesn't do so. However, the first set_var argument cannot point to - * an empty string or a string containing '='. 
- * - * There is no benefit in replicating GNU's env behavior, since it will only modify the - * environment in weird ways - */ - - if name.is_empty() { - show_warning!("no name specified for value {}", val.quote()); - continue; + // remove all env vars if told to ignore presets + if opts.ignore_env { + for (ref name, _) in env::vars() { + env::remove_var(name); + } } - env::set_var(name, val); - } - if opts.program.is_empty() { - // no program provided, so just dump all env vars to stdout - print_env(opts.line_ending); - } else { - // we need to execute a command - #[cfg(windows)] - let (prog, args) = build_command(&mut opts.program); - #[cfg(not(windows))] - let (prog, args) = build_command(&opts.program); - - /* - * On Unix-like systems Command::status either ends up calling either fork or posix_spawnp - * (which ends up calling clone). Keep using the current process would be ideal, but the - * standard library contains many checks and fail-safes to ensure the process ends up being - * created. This is much simpler than dealing with the hassles of calling execvp directly. - */ - match process::Command::new(&*prog).args(args).status() { - Ok(exit) if !exit.success() => { - #[cfg(unix)] - if let Some(exit_code) = exit.code() { - return Err(exit_code.into()); - } else { - // `exit.code()` returns `None` on Unix when the process is terminated by a signal. - // See std::os::unix::process::ExitStatusExt for more information. This prints out - // the interrupted process and the signal it received. - let signal_code = exit.signal().unwrap(); - let signal = Signal::try_from(signal_code).unwrap(); - - // We have to disable any handler that's installed by default. - // This ensures that we exit on this signal. - // For example, `SIGSEGV` and `SIGBUS` have default handlers installed in Rust. - // We ignore the errors because there is not much we can do if that fails anyway. 
- // SAFETY: The function is unsafe because installing functions is unsafe, but we are - // just defaulting to default behavior and not installing a function. Hence, the call - // is safe. - let _ = unsafe { - sigaction( - signal, - &SigAction::new(SigHandler::SigDfl, SaFlags::empty(), SigSet::all()), - ) - }; - - let _ = raise(signal); + // load .env-style config file prior to those given on the command-line + load_config_file(&mut opts)?; + + // unset specified env vars + for name in &opts.unsets { + if name.is_empty() || name.contains(0 as char) || name.contains('=') { + return Err(USimpleError::new( + 125, + format!("cannot unset {}: Invalid argument", name.quote()), + )); + } + + env::remove_var(name); + } + + // set specified env vars + for &(name, val) in &opts.sets { + /* + * set_var panics if name is an empty string + * set_var internally calls setenv (on unix at least), while GNU env calls putenv instead. + * + * putenv returns successfully if provided with something like "=a" and modifies the environ + * variable to contain "=a" inside it, effectively modifying the process' current environment + * to contain a malformed string in it. Using GNU's implementation, the command `env =a` + * prints out the malformed string and even invokes the child process with that environment. + * This can be seen by using `env -i =a env` or `env -i =a cat /proc/self/environ` + * + * POSIX.1-2017 doesn't seem to mention what to do if the string is malformed (at least + * not in "Chapter 8, Environment Variables" or in the definition for environ and various + * exec*'s or in the description of env in the "Shell & Utilities" volume). + * + * It also doesn't specify any checks for putenv before modifying the environ variable, which + * is likely why glibc doesn't do so. However, the first set_var argument cannot point to + * an empty string or a string containing '='. 
+ * + * There is no benefit in replicating GNU's env behavior, since it will only modify the + * environment in weird ways + */ + + if name.is_empty() { + show_warning!("no name specified for value {}", val.quote()); + continue; + } + env::set_var(name, val); + } + + if opts.program.is_empty() { + // no program provided, so just dump all env vars to stdout + print_env(opts.line_ending); + } else { + // we need to execute a command + let (prog, args) = build_command(&opts.program); + + if do_debug_printing { + eprintln!("executable: {}", prog); + for (i, arg) in args.iter().enumerate() { + eprintln!("arg[{}]: {}", i, arg); } - #[cfg(not(unix))] - return Err(exit.code().unwrap().into()); } - Err(ref err) if err.kind() == io::ErrorKind::NotFound => return Err(127.into()), - Err(_) => return Err(126.into()), - Ok(_) => (), + + /* + * On Unix-like systems Command::status either ends up calling either fork or posix_spawnp + * (which ends up calling clone). Keep using the current process would be ideal, but the + * standard library contains many checks and fail-safes to ensure the process ends up being + * created. This is much simpler than dealing with the hassles of calling execvp directly. + */ + match process::Command::new(&*prog).args(args).status() { + Ok(exit) if !exit.success() => { + #[cfg(unix)] + if let Some(exit_code) = exit.code() { + return Err(exit_code.into()); + } else { + // `exit.code()` returns `None` on Unix when the process is terminated by a signal. + // See std::os::unix::process::ExitStatusExt for more information. This prints out + // the interrupted process and the signal it received. + let signal_code = exit.signal().unwrap(); + let signal = Signal::try_from(signal_code).unwrap(); + + // We have to disable any handler that's installed by default. + // This ensures that we exit on this signal. + // For example, `SIGSEGV` and `SIGBUS` have default handlers installed in Rust. 
+ // We ignore the errors because there is not much we can do if that fails anyway. + // SAFETY: The function is unsafe because installing functions is unsafe, but we are + // just defaulting to default behavior and not installing a function. Hence, the call + // is safe. + let _ = unsafe { + sigaction( + signal, + &SigAction::new( + SigHandler::SigDfl, + SaFlags::empty(), + SigSet::all(), + ), + ) + }; + + let _ = raise(signal); + } + #[cfg(not(unix))] + return Err(exit.code().unwrap().into()); + } + Err(ref err) + if (err.kind() == io::ErrorKind::NotFound) + || (err.kind() == io::ErrorKind::InvalidInput) => + { + return Err(self.make_error_no_such_file_or_dir(prog.deref())); + } + Err(e) => { + eprintln!("unknown error: {:?}", e); + return Err(126.into()); + } + Ok(_) => (), + } } - } - Ok(()) + Ok(()) + } } #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { - run_env(args) + EnvAppData::default().run_env(args) } diff --git a/src/uu/env/src/parse_error.rs b/src/uu/env/src/parse_error.rs new file mode 100644 index 00000000000..0d06fb24386 --- /dev/null +++ b/src/uu/env/src/parse_error.rs @@ -0,0 +1,53 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use std::fmt; + +use crate::raw_string_parser; + +/// An error returned when string arg splitting fails. 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ParseError { + MissingClosingQuote { + pos: usize, + c: char, + }, + InvalidBackslashAtEndOfStringInMinusS { + pos: usize, + quoting: String, + }, + BackslashCNotAllowedInDoubleQuotes { + pos: usize, + }, + InvalidSequenceBackslashXInMinusS { + pos: usize, + c: char, + }, + ParsingOfVariableNameFailed { + pos: usize, + msg: String, + }, + InternalError { + pos: usize, + sub_err: raw_string_parser::Error, + }, +} + +impl fmt::Display for ParseError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(format!("{:?}", self).as_str()) + } +} + +impl std::error::Error for ParseError {} + +impl From for ParseError { + fn from(value: raw_string_parser::Error) -> Self { + Self::InternalError { + pos: value.look_at_pos, + sub_err: value, + } + } +} diff --git a/src/uu/env/src/raw_string_parser.rs b/src/uu/env/src/raw_string_parser.rs new file mode 100644 index 00000000000..b500f60a0b9 --- /dev/null +++ b/src/uu/env/src/raw_string_parser.rs @@ -0,0 +1,307 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. +// +//! SAFETY: This module does "unsafe" byte by byte operations on a UTF8 encoded string. +//! UTF8 encodes all non-ASCII characters as multi-byte characters. Meaning that the UTF8 +//! string contains short sequences of bytes which should not be splitted or individually modified. +//! All bytes that belong to a multi-byte character sequence are defined to have a different value +//! than any ASCII single byte char. This can be used to easily detect where multi-byte character sequences +//! start and end. +//! To guarantee that after processing the output is again valid UTF8, the following rules must apply: +//! 1. Move multi-byte characters as a whole. +//! 2. Insert characters only on ASCII boundaries. +//! 
We also want to support even strings that contain partially invalid utf8. Thats why we can't rely +//! on std library functionality when dealing with multi-byte characters. +//! +//! The general idea of this module is to encapsulate the unsafe parts in a small and easily testable unit. +// spell-checker:ignore (words) splitted +#![allow(unsafe_code)] + +use std::mem; + +pub fn is_ascii(c: u8) -> bool { + (c & 0x80) == 0 +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + pub look_at_pos: usize, + pub err_type: ErrorType, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ErrorType { + NoAsciiBoundary, + NoAsciiChar, + NoAsciiCharInput, + EndOfInput, + InternalError, +} + +pub struct RawStringParser<'a> { + pub input: &'a str, + pointer: usize, + pointer_str: &'a str, // just for debugging sessions. In release build it will be removed by the compiler. +} + +pub struct RawStringExpander<'a> { + parser: RawStringParser<'a>, + output: String, +} + +impl<'a> RawStringExpander<'a> { + pub fn new(input: &'a str) -> Self { + Self { + parser: RawStringParser::new(input), + output: String::default(), + } + } + + pub fn new_at(input: &'a str, pos: usize) -> Result { + Ok(Self { + parser: RawStringParser::new_at(input, pos)?, + output: String::default(), + }) + } + + pub fn get_parser(&self) -> &RawStringParser<'a> { + &self.parser + } + + pub fn get_parser_mut(&mut self) -> &mut RawStringParser<'a> { + &mut self.parser + } + + pub fn skip_one(&mut self) -> Result<(), Error> { + self.get_parser_mut().skip_one() + } + + pub fn get_look_at_pos(&self) -> usize { + self.get_parser().get_look_at_pos() + } + + pub fn take_one(&mut self) -> Result<(), Error> { + let parser = &mut self.parser; + let mut c = parser.look_at()?; + loop { + // SAFETY: Just moving any non-ASCII sequence as a whole is keeping multibyte chars intact. 
+ // SAFETY: Additionally, the function 'take_collected_output' ensures that + // we only take the result when its end is at a ASCII boundary + unsafe { + self.output.as_mut_vec().push(c); + } + parser.set_pointer(parser.pointer + 1); + + if is_ascii(c) { + break; // stop at ASCII boundary + } + + if parser.pointer == parser.input.as_bytes().len() { + break; + } + + c = parser.look_at()?; + if is_ascii(c) { + break; // stop at ASCII boundary + } + } + + Ok(()) + } + + pub fn put_one_ascii(&mut self, c: u8) -> Result<(), Error> { + let parser = &self.parser; + if !is_ascii(c) { + return Err(parser.make_err(ErrorType::NoAsciiCharInput)); // SAFETY: only ASCII character are allowed to be pushed this way. + } + let boundary_detected = parser.detect_boundary()?; + if boundary_detected { + // SAFETY: when current look_at is ascii or the one before or we are at one of the two ends of the input, + // then we can't destroy a multi-byte-non-ascii char of input. + unsafe { + self.output.as_mut_vec().push(c); + } + Ok(()) + } else { + Err(parser.make_err(ErrorType::NoAsciiBoundary)) + } + } + + pub fn put_string_utf8(&mut self, str: &str) -> Result<(), Error> { + let parser = &self.parser; + let boundary_detected = parser.detect_boundary()?; + if boundary_detected { + // SAFETY: when current look_at is ascii or the one before or we are at one of the two ends of the input, + // then we can't destroy a multi-byte-non-ascii char of input. + self.output.push_str(str); + Ok(()) + } else { + Err(parser.make_err(ErrorType::NoAsciiBoundary)) + } + } + + pub fn take_collected_output(&mut self) -> Result { + let parser = &self.parser; + let boundary_detected = parser.detect_boundary()?; + if boundary_detected { + // SAFETY: when current look_at is ascii or the one before or we are at one of the two ends of the input, + // then we can't destroy a multi-byte-non-ascii char of input. 
+ Ok(mem::take(&mut self.output)) + } else { + Err(parser.make_err(ErrorType::NoAsciiBoundary)) + } + } +} + +impl<'a> RawStringParser<'a> { + pub fn new(input: &'a str) -> Self { + Self { + input, + pointer: 0, + pointer_str: input, + } + } + + pub fn new_at(input: &'a str, pos: usize) -> Result { + let instance = Self { + input, + pointer: pos, + pointer_str: input, + }; + + if !instance.detect_boundary()? { + return Err(Error { + look_at_pos: instance.get_look_at_pos(), + err_type: ErrorType::NoAsciiBoundary, + }); + } + + Ok(instance) + } + + pub fn get_look_at_pos(&self) -> usize { + self.pointer + } + + pub fn look_at(&self) -> Result { + self.look_at_pointer(self.pointer) + } + + fn make_err(&self, err_type: ErrorType) -> Error { + Error { + look_at_pos: self.get_look_at_pos(), + err_type, + } + } + + pub fn look_at_pointer(&self, at_pointer: usize) -> Result { + let c = self.input.as_bytes().get(at_pointer); + if let Some(c) = c { + Ok(*c) + } else { + Err(self.make_err(ErrorType::EndOfInput)) + } + } + + pub fn skip_one(&mut self) -> Result<(), Error> { + let mut c = self.look_at()?; + loop { + // SAFETY: Just skipping any non-ASCII sequence as a whole is keeping multibyte chars intact. 
+ // SAFETY: Additionally, the function 'take_collected_output' ensures that + // we only take the result when its end is at a ASCII boundary + self.set_pointer(self.pointer + 1); + + if is_ascii(c) { + break; // stop at ASCII boundary + } + + if self.pointer == self.input.as_bytes().len() { + break; + } + + c = self.look_at()?; + if is_ascii(c) { + break; // stop at ASCII boundary + } + } + + Ok(()) + } + + pub fn skip_multiple_ascii_bounded(&mut self, skip_byte_count: usize) -> Result<(), Error> { + let start_bounds = self.detect_boundary_at(self.pointer)?; + let end_ptr = self.pointer + skip_byte_count; + let end_bounds = self.detect_boundary_at(end_ptr)?; + if start_bounds && end_bounds { + self.set_pointer(end_ptr); + return Ok(()); + } + + Err(self.make_err(ErrorType::NoAsciiBoundary)) + } + + pub fn skip_until_ascii_char_or_end(&mut self, c: u8) -> Result<(), Error> { + if !is_ascii(c) { + return Err(self.make_err(ErrorType::NoAsciiCharInput)); + } + let boundary = self.detect_boundary()?; + if !boundary { + // SAFETY: moving away from within a multi-byte char is not allowed + return Err(self.make_err(ErrorType::NoAsciiBoundary)); + } + let remaining = self.input.as_bytes().get(self.pointer..); + if let Some(remaining_str) = remaining { + let pos = memchr::memchr(c, remaining_str); + if let Some(pos) = pos { + // SAFETY: new pointer position is on ASCII char + self.set_pointer(self.pointer + pos); + } else { + // SAFETY: setting pointer to the end should be valid as input is valid + self.set_pointer(self.input.len()); + } + return Ok(()); + } + Err(self.make_err(ErrorType::InternalError)) + } + + pub fn detect_boundary_at(&self, at_pointer: usize) -> Result { + let boundary_detected = (at_pointer == 0) + || (at_pointer == self.input.bytes().len()) + || is_ascii(self.look_at_pointer(at_pointer)?) 
+ || is_ascii(self.look_at_pointer(at_pointer - 1)?); + Ok(boundary_detected) + } + + pub fn detect_boundary(&self) -> Result { + self.detect_boundary_at(self.pointer) + } + + pub fn get_substring(&self, range: &std::ops::Range) -> Result<&'a str, Error> { + let start_boundary = self.detect_boundary_at(range.start)?; + let end_boundary = self.detect_boundary_at(range.end)?; + if start_boundary && end_boundary { + Ok(self.input.get(range.start..range.end).unwrap()) + } else { + Err(self.make_err(ErrorType::NoAsciiBoundary)) + } + } + + pub fn look_at_remaining(&self) -> Result<&'a str, Error> { + let boundary_detected = self.detect_boundary()?; + if boundary_detected { + // SAFETY: when current look_at is ascii or the one before or we are at one of the two ends of the input, + // then we can't destroy a multi-byte-non-ascii char of input. + Ok(self.pointer_str) + } else { + Err(self.make_err(ErrorType::NoAsciiBoundary)) + } + } + + // UNSAFE -> private + fn set_pointer(&mut self, new_pointer: usize) { + self.pointer = new_pointer; + self.pointer_str = self.input.get(self.pointer..).unwrap_or("\u{FFFD}"); + } +} diff --git a/src/uu/env/src/split_iterator.rs b/src/uu/env/src/split_iterator.rs new file mode 100644 index 00000000000..29c9743570e --- /dev/null +++ b/src/uu/env/src/split_iterator.rs @@ -0,0 +1,505 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. +// +// This file is based on work from Tomasz Miąsko who published it as "shell_words" crate, +// licensed under the Apache License, Version 2.0 +// or the MIT license , at your option. +// +//! Process command line according to parsing rules of original GNU env. +//! Even though it looks quite like a POSIX syntax, the original +//! "shell_words" implementation had to be adapted significantly. +//! +//! 
Apart from the grammar differences, there is a new feature integrated: $VARIABLE expansion. +//! +//! [GNU env] +// spell-checker:ignore (words) Tomasz Miąsko rntfv FFFD varname + +#![forbid(unsafe_code)] + +use std::mem; +use std::ops::Range; + +use crate::parse_error::ParseError; +use crate::raw_string_parser::is_ascii; +use crate::raw_string_parser::RawStringExpander; +use crate::raw_string_parser::RawStringParser; + +#[derive(Clone, Copy)] +pub enum State { + /// Within a delimiter. + Delimiter, + /// After backslash, but before starting word. + DelimiterBackslash, + /// Within an unquoted word. + Unquoted, + /// After backslash in an unquoted word. + UnquotedBackslash, + /// Within a single quoted word. + SingleQuoted, + /// After backslash inside a single quoted word. + SingleQuotedBackslash, + /// Within a double quoted word. + DoubleQuoted, + /// After backslash inside a double quoted word. + DoubleQuotedBackslash, + /// Inside a comment. + Comment, +} + +const BACKSLASH: u8 = b'\\'; +const DOUBLE_QUOTES: u8 = b'\"'; +const SINGLE_QUOTES: u8 = b'\''; +const INVALID_UTF8_MARKER: char = '\u{FFFD}'; + +/// Bytes accepted after a backslash (`.0`) and the byte each one is replaced with (`.1`), paired by index. +const REPLACEMENTS: (&[u8], &[u8]) = ("rntfv_#$\"".as_bytes(), "\r\n\t\x0C\x0B #$\"".as_bytes()); +static_assertions::const_assert_eq!(REPLACEMENTS.0.len(), REPLACEMENTS.1.len()); +const ASCII_WHITESPACE_CHARS: &[u8] = " \t\r\n\x0B\x0C".as_bytes(); + +pub struct SplitIterator<'a> { + pub raw_parser: RawStringExpander<'a>, + pub words: Vec, + pub state: State, +} + +impl<'a> SplitIterator<'a> { + pub fn new(s: &'a str) -> Self { + Self { + raw_parser: RawStringExpander::new(s), + words: Vec::::new(), + state: State::Delimiter, + } + } + + fn skip_one(&mut self) -> Result<(), ParseError> { + Ok(self.raw_parser.get_parser_mut().skip_one()?) + } + + fn take_one(&mut self) -> Result<(), ParseError> { + Ok(self.raw_parser.take_one()?) 
+ } + + fn get_current_char(&self) -> Option { + self.raw_parser.get_parser().look_at().ok() + } + + fn push_ascii_char_to_word(&mut self, c: u8) -> Result<(), ParseError> { + Ok(self.raw_parser.put_one_ascii(c)?) + } + + fn push_word_to_words(&mut self) -> Result<(), ParseError> { + let word = self.raw_parser.take_collected_output()?; + self.words.push(word); + Ok(()) + } + + fn check_variable_name_start(&self) -> Result<(), ParseError> { + if let Some(c) = self.get_current_char() { + if c.is_ascii_digit() { + return Err(ParseError::ParsingOfVariableNameFailed { + pos: self.raw_parser.get_parser().get_look_at_pos(), + msg: format!("Unexpected character: '{}', expected variable name must not start with 0..9", c as char) }); + } + } + Ok(()) + } + + fn get_parser(&self) -> &RawStringParser<'a> { + self.raw_parser.get_parser() + } + + fn get_parser_mut(&mut self) -> &mut RawStringParser<'a> { + self.raw_parser.get_parser_mut() + } + + fn parse_braced_variable_name(&mut self) -> Result<(&'a str, Option<&'a str>), ParseError> { + let pos_start = self.get_parser().get_look_at_pos(); + + self.check_variable_name_start()?; + + let (varname_end, default_end); + loop { + match self.get_current_char() { + None => { + return Err(ParseError::ParsingOfVariableNameFailed { + pos: self.get_parser().get_look_at_pos(), msg: "Missing closing brace".into() }) + }, + Some(c) if !c.is_ascii() || c.is_ascii_alphanumeric() || c == b'_' => { + self.skip_one()?; + } + Some(b':') => { + varname_end = self.get_parser().get_look_at_pos(); + loop { + match self.get_current_char() { + None => { + return Err(ParseError::ParsingOfVariableNameFailed { + pos: self.get_parser().get_look_at_pos(), + msg: "Missing closing brace after default value".into() }) + }, + Some(b'}') => { + default_end = Some(self.get_parser().get_look_at_pos()); + self.skip_one()?; + break + }, + Some(_) => { + self.skip_one()?; + }, + } + } + break; + }, + Some(b'}') => { + varname_end = self.get_parser().get_look_at_pos(); 
+ default_end = None; + self.skip_one()?; + break; + }, + Some(c) => { + return Err(ParseError::ParsingOfVariableNameFailed { + pos: self.get_parser().get_look_at_pos(), + msg: format!("Unexpected character: '{}', expected a closing brace ('}}') or colon (':')", c as char) + }) + }, + }; + } + + let default = if let Some(default_end) = default_end { + Some(self.get_parser().get_substring(&Range { + start: varname_end + 1, + end: default_end, + })?) + } else { + None + }; + + let varname = self.get_parser().get_substring(&Range { + start: pos_start, + end: varname_end, + })?; + + Ok((varname, default)) + } + + fn parse_unbraced_variable_name(&mut self) -> Result<&str, ParseError> { + let pos_start = self.get_parser().get_look_at_pos(); + + self.check_variable_name_start()?; + + loop { + match self.get_current_char() { + None => break, + Some(c) if c.is_ascii_alphanumeric() || c == b'_' => { + self.get_parser_mut().skip_one()?; + } + Some(_) => break, + }; + } + + let pos_end = self.get_parser().get_look_at_pos(); + + if pos_end == pos_start { + return Err(ParseError::ParsingOfVariableNameFailed { + pos: pos_start, + msg: "Missing variable name".into(), + }); + } + + Ok(self.get_parser().get_substring(&Range { + start: pos_start, + end: pos_end, + })?) + } + + fn substitute_variable(&mut self) -> Result<(), ParseError> { + self.get_parser_mut().skip_one()?; + + let (name, default) = match self.get_current_char() { + None => { + return Err(ParseError::ParsingOfVariableNameFailed { + pos: self.get_parser().get_look_at_pos(), + msg: "missing variable name".into(), + }) + } + Some(b'{') => { + self.skip_one()?; + self.parse_braced_variable_name()? 
+ } + Some(_) => (self.parse_unbraced_variable_name()?, None), + }; + + let value = std::env::var(name).ok(); + match (&value, default) { + (None, None) => {} // do nothing, just replace it with "" + (Some(value), _) => { + self.raw_parser.put_string_utf8(value)?; + } + (None, Some(default)) => { + self.raw_parser.put_string_utf8(default)?; + } + }; + + Ok(()) + } + + fn check_and_replace_ascii_escape_code(&mut self, c: u8) -> Result { + let (from, to) = REPLACEMENTS; + if let Some(pos) = memchr::memchr(c, from) { + self.skip_one()?; + self.push_ascii_char_to_word(*to.get(pos).unwrap())?; + return Ok(true); + } + + Ok(false) + } + + fn make_invalid_sequence_backslash_xin_minus_s(&self, c: u8) -> ParseError { + let valid_char: char = if is_ascii(c) { + c.into() + } else { + INVALID_UTF8_MARKER + }; + ParseError::InvalidSequenceBackslashXInMinusS { + pos: self.raw_parser.get_parser().get_look_at_pos(), + c: valid_char, + } + } + + pub fn split(&mut self) -> Result, ParseError> { + use State::*; + + loop { + let c = self.get_current_char(); + let _c_char = + c.map(|c| -> char { char::from_u32(c.into()).unwrap_or(INVALID_UTF8_MARKER) }); // just for debugging session. 
In release, compiler will remove + self.state = match self.state { + Delimiter => match c { + None => break, + Some(SINGLE_QUOTES) => { + self.skip_one()?; + SingleQuoted + } + Some(DOUBLE_QUOTES) => { + self.skip_one()?; + DoubleQuoted + } + Some(BACKSLASH) => { + self.skip_one()?; + DelimiterBackslash + } + Some(c) if ASCII_WHITESPACE_CHARS.contains(&c) => { + self.skip_one()?; + Delimiter + } + Some(b'#') => { + self.skip_one()?; + Comment + } + Some(b'$') => { + self.substitute_variable()?; + Unquoted + } + Some(_) => { + self.take_one()?; + Unquoted + } + }, + DelimiterBackslash => match c { + None => { + return Err(ParseError::InvalidBackslashAtEndOfStringInMinusS { + pos: self.get_parser().get_look_at_pos(), + quoting: "Delimiter".into(), + }) + } + Some(b'_') => { + self.skip_one()?; + Delimiter + } + Some(b'\n') => { + self.skip_one()?; + Delimiter + } + Some(b'$') | Some(BACKSLASH) | Some(b'#') | Some(SINGLE_QUOTES) + | Some(DOUBLE_QUOTES) => { + self.take_one()?; + Unquoted + } + Some(b'c') => break, + Some(c) if self.check_and_replace_ascii_escape_code(c)? 
=> Unquoted, + Some(c) => return Err(self.make_invalid_sequence_backslash_xin_minus_s(c)), + }, + Unquoted => match c { + None => { + self.push_word_to_words()?; + break; + } + Some(b'$') => { + self.substitute_variable()?; + self.state + } + Some(SINGLE_QUOTES) => { + self.skip_one()?; + SingleQuoted + } + Some(DOUBLE_QUOTES) => { + self.skip_one()?; + DoubleQuoted + } + Some(BACKSLASH) => { + self.skip_one()?; + UnquotedBackslash + } + Some(c) if ASCII_WHITESPACE_CHARS.contains(&c) => { + self.push_word_to_words()?; + self.skip_one()?; + Delimiter + } + Some(_) => { + self.take_one()?; + Unquoted + } + }, + UnquotedBackslash => match c { + None => { + return Err(ParseError::InvalidBackslashAtEndOfStringInMinusS { + pos: self.get_parser().get_look_at_pos(), + quoting: "Unquoted".into(), + }) + } + Some(b'\n') => { + self.skip_one()?; + Unquoted + } + Some(b'_') => { + self.skip_one()?; + self.push_word_to_words()?; + Delimiter + } + Some(b'c') => { + self.push_word_to_words()?; + break; + } + Some(b'$') | Some(BACKSLASH) | Some(SINGLE_QUOTES) | Some(DOUBLE_QUOTES) => { + self.take_one()?; + Unquoted + } + Some(c) if self.check_and_replace_ascii_escape_code(c)? 
=> Unquoted, + Some(c) => return Err(self.make_invalid_sequence_backslash_xin_minus_s(c)), + }, + SingleQuoted => match c { + None => { + return Err(ParseError::MissingClosingQuote { + pos: self.get_parser().get_look_at_pos(), + c: '\'', + }) + } + Some(SINGLE_QUOTES) => { + self.skip_one()?; + Unquoted + } + Some(BACKSLASH) => { + self.skip_one()?; + SingleQuotedBackslash + } + Some(_) => { + self.take_one()?; + SingleQuoted + } + }, + SingleQuotedBackslash => match c { + None => { + return Err(ParseError::MissingClosingQuote { + pos: self.get_parser().get_look_at_pos(), + c: '\'', + }) + } + Some(b'\n') => { + self.skip_one()?; + SingleQuoted + } + Some(SINGLE_QUOTES) | Some(BACKSLASH) => { + self.take_one()?; + SingleQuoted + } + Some(c) if REPLACEMENTS.0.contains(&c) => { + // See GNU test-suite e11: In single quotes, \t remains as it is. + // Comparing with GNU behavior: \a is not accepted and issues an error. + // So apparently only known sequences are allowed, even though they are not expanded.... bug of GNU? 
+ self.push_ascii_char_to_word(BACKSLASH)?; + self.take_one()?; + SingleQuoted + } + Some(c) => return Err(self.make_invalid_sequence_backslash_xin_minus_s(c)), + }, + DoubleQuoted => match c { + None => { + return Err(ParseError::MissingClosingQuote { + pos: self.get_parser().get_look_at_pos(), + c: '"', + }) + } + Some(b'$') => { + self.substitute_variable()?; + self.state + } + Some(DOUBLE_QUOTES) => { + self.skip_one()?; + Unquoted + } + Some(BACKSLASH) => { + self.skip_one()?; + DoubleQuotedBackslash + } + Some(_) => { + self.take_one()?; + DoubleQuoted + } + }, + DoubleQuotedBackslash => match c { + None => { + return Err(ParseError::MissingClosingQuote { + pos: self.get_parser().get_look_at_pos(), + c: '"', + }) + } + Some(b'\n') => { + self.skip_one()?; + DoubleQuoted + } + Some(DOUBLE_QUOTES) | Some(b'$') | Some(BACKSLASH) => { + self.take_one()?; + DoubleQuoted + } + Some(b'c') => { + return Err(ParseError::BackslashCNotAllowedInDoubleQuotes { + pos: self.get_parser().get_look_at_pos(), + }) + } + Some(c) if self.check_and_replace_ascii_escape_code(c)? => DoubleQuoted, + Some(c) => return Err(self.make_invalid_sequence_backslash_xin_minus_s(c)), + }, + Comment => match c { + None => break, + Some(b'\n') => { + self.skip_one()?; + Delimiter + } + Some(_) => { + self.get_parser_mut().skip_until_ascii_char_or_end(b'\n')?; + Comment + } + }, + }; + + if c.is_none() { + break; + } + } + + Ok(mem::take(&mut self.words)) + } +} + +pub fn split(s: &str) -> Result, ParseError> { + SplitIterator::new(s).split() +} diff --git a/tests/by-util/test_env.rs b/tests/by-util/test_env.rs index 8ce55a1d3a2..4fb6e19bb8a 100644 --- a/tests/by-util/test_env.rs +++ b/tests/by-util/test_env.rs @@ -2,8 +2,10 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. 
-// spell-checker:ignore (words) bamf chdir rlimit prlimit COMSPEC +// spell-checker:ignore (words) bamf chdir rlimit prlimit COMSPEC cout cerr +#[cfg(any(target_os = "linux", target_os = "android"))] +use crate::common::util::expected_result; use crate::common::util::TestScenario; use std::env; use std::path::Path; @@ -34,11 +36,24 @@ fn test_env_version() { #[test] fn test_echo() { - let result = new_ucmd!().arg("echo").arg("FOO-bar").succeeds(); + #[cfg(target_os = "windows")] + let args = ["cmd", "/d/c", "echo"]; + #[cfg(not(target_os = "windows"))] + let args = ["echo"]; + + let result = new_ucmd!().args(&args).arg("FOO-bar").succeeds(); assert_eq!(result.stdout_str().trim(), "FOO-bar"); } +#[cfg(target_os = "windows")] +#[test] +fn test_if_windows_batch_files_can_be_executed() { + let result = new_ucmd!().arg("./runBat.bat").succeeds(); + + assert!(result.stdout_str().contains("Hello Windows World!")); +} + #[test] fn test_file_option() { let out = new_ucmd!() @@ -245,3 +260,834 @@ fn test_fail_change_directory() { .stderr_move_str(); assert!(out.contains("env: cannot change directory to ")); } + +fn modify_newlines_according_platform(input: &str) -> String { + #[cfg(target_os = "windows")] + { + input.replace("\n", "\r\n") + } + #[cfg(not(target_os = "windows"))] + { + input.into() + } +} + +#[cfg(not(target_os = "windows"))] // windows has no executable "echo", its only supported as part of a batch-file +#[test] +fn test_split_string_into_args_one_argument_no_quotes() { + let scene = TestScenario::new(util_name!()); + + let out = scene + .ucmd() + .arg("-S echo hello world") + .succeeds() + .stdout_move_str(); + assert_eq!(out, modify_newlines_according_platform("hello world\n")); +} + +#[cfg(not(target_os = "windows"))] // windows has no executable "echo", its only supported as part of a batch-file +#[test] +fn test_split_string_into_args_one_argument() { + let scene = TestScenario::new(util_name!()); + + let out = scene + .ucmd() + .arg("-S echo \"hello 
world\"") + .succeeds() + .stdout_move_str(); + assert_eq!(out, modify_newlines_according_platform("hello world\n")); +} + +#[cfg(not(target_os = "windows"))] // windows has no executable "echo", its only supported as part of a batch-file +#[test] +fn test_split_string_into_args_s_escaping_challenge() { + let scene = TestScenario::new(util_name!()); + + let out = scene + .ucmd() + .args(&[r#"-S echo "hello \"great\" world""#]) + .succeeds() + .stdout_move_str(); + assert_eq!( + out, + modify_newlines_according_platform("hello \"great\" world\n") + ); +} + +#[test] +fn test_split_string_into_args_s_escaped_c_not_allowed() { + let scene = TestScenario::new(util_name!()); + + let out = scene.ucmd().args(&[r#"-S"\c""#]).fails().stderr_move_str(); + assert_eq!( + out, + "env: '\\c' must not appear in double-quoted -S string\n" + ); +} + +#[cfg(not(target_os = "windows"))] // no printf available +#[test] +fn test_split_string_into_args_s_whitespace_handling() { + let scene = TestScenario::new(util_name!()); + + let out = scene + .ucmd() + .args(&["-Sprintf x%sx\\n A \t B \x0B\x0C\r\n"]) + .succeeds() + .stdout_move_str(); + assert_eq!(out, "xAx\nxBx\n"); +} + +#[cfg(not(target_os = "windows"))] // no printf available +#[test] +fn test_split_string_into_args_long_option_whitespace_handling() { + let scene = TestScenario::new(util_name!()); + + let out = scene + .ucmd() + .args(&["--split-string printf x%sx\\n A \t B \x0B\x0C\r\n"]) + .succeeds() + .stdout_move_str(); + assert_eq!(out, "xAx\nxBx\n"); +} + +#[cfg(not(target_os = "windows"))] // no printf available +#[test] +fn test_split_string_into_args_debug_output_whitespace_handling() { + let scene = TestScenario::new(util_name!()); + + let out = scene + .ucmd() + .args(&["-vS printf x%sx\\n A \t B \x0B\x0C\r\n"]) + .succeeds(); + assert_eq!(out.stdout_str(), "xAx\nxBx\n"); + assert_eq!(out.stderr_str(), "input args:\narg[0]: env\narg[1]: -vS printf x%sx\\n A \t B \u{b}\u{c}\r\n\nexecutable: printf\narg[0]: 
x%sx\n\narg[1]: A\narg[2]: B\n"); +} + +#[test] +fn test_split_string_misc() { + use ::env::parse_args_from_str; + + assert_eq!( + vec!["A=B", "FOO=AR", "sh", "-c", "echo $A$FOO"], + parse_args_from_str(r#"A=B FOO=AR sh -c "echo \$A\$FOO""#).unwrap(), + ); + assert_eq!( + vec!["A=B", "FOO=AR", "sh", "-c", "echo $A$FOO"], + parse_args_from_str(r#"A=B FOO=AR sh -c 'echo $A$FOO'"#).unwrap(), + ); + assert_eq!( + vec!["A=B", "FOO=AR", "sh", "-c", "echo $A$FOO"], + parse_args_from_str(r#"A=B FOO=AR sh -c 'echo $A$FOO'"#).unwrap(), + ); + + assert_eq!( + vec!["-i", "A=B ' C"], + parse_args_from_str(r#"-i A='B \' C'"#).unwrap(), + ); +} + +#[test] +fn test_split_string_environment_vars_test() { + std::env::set_var("FOO", "BAR"); + assert_eq!( + vec!["FOO=bar", "sh", "-c", "echo xBARx =$FOO="], + ::env::parse_args_from_str(r#"FOO=bar sh -c "echo x${FOO}x =\$FOO=""#).unwrap(), + ); +} + +#[macro_export] +macro_rules! compare_with_gnu { + ( $ts:expr, $args:expr ) => {{ + eprintln!("=========================================================================="); + let result = $ts.ucmd().args($args).run(); + + #[cfg(any(target_os = "linux", target_os = "android"))] + { + let reference = expected_result(&$ts, $args); + if let Ok(reference) = reference { + let success = result.code() == reference.code() + && result.stdout_str() == reference.stdout_str() + && result.stderr_str() == reference.stderr_str(); + if !success { + eprintln!("reference.code: {}", reference.code()); + eprintln!(" result.code: {}", result.code()); + eprintln!("reference.cout: {}", reference.stdout_str()); + eprintln!(" result.cout: {}", result.stdout_str()); + eprintln!("reference.cerr: {}", reference.stderr_str_lossy()); + eprintln!(" result.cerr: {}", result.stderr_str_lossy()); + } + assert_eq!(result.code(), reference.code()); + assert_eq!(result.stdout_str(), reference.stdout_str()); + assert_eq!(result.stderr_str(), reference.stderr_str()); + } else { + println!( + "gnu reference test skipped. 
Reason: {:?}", + reference.unwrap_err() + ); + } + } + + result + }}; +} + +#[test] +fn test_env_with_gnu_reference() { + let ts = TestScenario::new(util_name!()); + + compare_with_gnu!(ts, &["-S\\|echo hallo"]) // no quotes, invalid escape sequence | + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\|' in -S\n"); + + compare_with_gnu!(ts, &["-S\\a"]) // no quotes, invalid escape sequence a + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\a' in -S\n"); + + compare_with_gnu!(ts, &["-S\"\\a\""]) // double quotes, invalid escape sequence a + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\a' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S"\a""#]) // same as before, just using r#""# + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\a' in -S\n"); + + compare_with_gnu!(ts, &["-S'\\a'"]) // single quotes, invalid escape sequence a + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\a' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S\|\&\;"#]) // no quotes, invalid escape sequence | + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\|' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S\<\&\;"#]) // no quotes, invalid escape sequence < + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\<' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S\>\&\;"#]) // no quotes, invalid escape sequence > + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\>' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S\`\&\;"#]) // no quotes, invalid escape sequence ` + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\`' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S"\`\&\;""#]) // double quotes, invalid escape sequence ` + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\`' in -S\n"); + + compare_with_gnu!(ts, 
&[r#"-S'\`\&\;'"#]) // single quotes, invalid escape sequence ` + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\`' in -S\n"); + + ts.ucmd() + .args(&["-S\\'\\'"]) // empty single quotes, considered as program name + .fails() + .code_is(127) + .no_stdout() + .stderr_is("env: '''': No such file or directory\n"); // gnu version again adds escaping here + + compare_with_gnu!(ts, &["-S\\\"\\\""]) // empty double quotes, considered as program name + .failure() + .code_is(127) + .no_stdout() + .stderr_is("env: '\"\"': No such file or directory\n"); + + compare_with_gnu!(ts, &[r#"-S\`"#]) // ` escaped without quotes + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\`' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S"\`""#]) // ` escaped in double quotes + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\`' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S'\`'"#]) // ` escaped in single quotes + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\`' in -S\n"); + + compare_with_gnu!(ts, &[r#"-S\🦉"#]) // non-ASCII (multi-byte) char escaped without quotes, reported as U+FFFD + .failure() + .code_is(125) + .no_stdout() + .stderr_is("env: invalid sequence '\\\u{FFFD}' in -S\n"); +} + +#[cfg(test)] +mod tests_split_iterator { + + enum EscapeStyle { + /// No escaping. + None, + /// Wrap in single quotes. + SingleQuoted, + /// Single quotes combined with backslash. + Mixed, + } + + /// Determines escaping style to use. + fn escape_style(s: &str) -> EscapeStyle { + if s.is_empty() { + return EscapeStyle::SingleQuoted; + } + + let mut special = false; + let mut newline = false; + let mut single_quote = false; + + for c in s.chars() { + match c { + '\n' => { + newline = true; + special = true; + } + '\'' => { + single_quote = true; + special = true; + } + '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | ' ' | '\t' + | '*' | '?' 
| '[' | '#' | '˜' | '=' | '%' => { + special = true; + } + _ => continue, + } + } + + if !special { + EscapeStyle::None + } else if newline && !single_quote { + EscapeStyle::SingleQuoted + } else { + EscapeStyle::Mixed + } + } + + /// Escapes special characters in a string, so that it will retain its literal + /// meaning when used as a part of command in Unix shell. + /// + /// It tries to avoid introducing any unnecessary quotes or escape characters, + /// but specifics regarding quoting style are left unspecified. + pub fn quote(s: &str) -> std::borrow::Cow { + // We are going somewhat out of the way to provide + // minimal amount of quoting in typical cases. + match escape_style(s) { + EscapeStyle::None => s.into(), + EscapeStyle::SingleQuoted => format!("'{}'", s).into(), + EscapeStyle::Mixed => { + let mut quoted = String::new(); + quoted.push('\''); + for c in s.chars() { + if c == '\'' { + quoted.push_str("'\\''"); + } else { + quoted.push(c); + } + } + quoted.push('\''); + quoted.into() + } + } + } + + /// Joins arguments into a single command line suitable for execution in Unix + /// shell. + /// + /// Each argument is quoted using [`quote`] to preserve its literal meaning when + /// parsed by Unix shell. + /// + /// Note: This function is essentially an inverse of [`split`]. 
+ /// + /// # Examples + /// + /// Logging executed commands in format that can be easily copied and pasted + /// into an actual shell: + /// + /// ```rust,no_run + /// fn execute(args: &[&str]) { + /// use std::process::Command; + /// println!("Executing: {}", shell_words::join(args)); + /// Command::new(&args[0]) + /// .args(&args[1..]) + /// .spawn() + /// .expect("failed to start subprocess") + /// .wait() + /// .expect("failed to wait for subprocess"); + /// } + /// + /// execute(&["python", "-c", "print('Hello world!')"]); + /// ``` + /// + /// [`quote`]: fn.quote.html + /// [`split`]: fn.split.html + pub fn join(words: I) -> String + where + I: IntoIterator, + S: AsRef, + { + let mut line = words.into_iter().fold(String::new(), |mut line, word| { + let quoted = quote(word.as_ref()); + line.push_str(quoted.as_ref()); + line.push(' '); + line + }); + line.pop(); + line + } + + use ::env::parse_error::ParseError; + use ::env::split_iterator::*; + + fn split_ok(cases: &[(&str, &[&str])]) { + for (i, &(input, expected)) in cases.iter().enumerate() { + match split(input) { + Err(actual) => { + panic!( + "[{i}] calling split({:?}):\nexpected: Ok({:?})\n actual: Err({:?})\n", + input, expected, actual + ); + } + Ok(actual) => { + assert!( + expected == actual.as_slice(), + "[{i}] After split({:?}).unwrap()\nexpected: {:?}\n actual: {:?}\n", + input, + expected, + actual + ); + } + } + } + } + + #[test] + fn split_empty() { + split_ok(&[("", &[])]); + } + + #[test] + fn split_initial_whitespace_is_removed() { + split_ok(&[ + (" a", &["a"]), + ("\t\t\t\tbar", &["bar"]), + ("\t \nc", &["c"]), + ]); + } + + #[test] + fn split_trailing_whitespace_is_removed() { + split_ok(&[ + ("a ", &["a"]), + ("b\t", &["b"]), + ("c\t \n \n \n", &["c"]), + ("d\n\n", &["d"]), + ]); + } + + #[test] + fn split_carriage_return() { + split_ok(&[("c\ra\r'\r'\r", &["c", "a", "\r"])]); + } + + #[test] + fn split_() { + split_ok(&[("\\'\\'", &["''"])]); + } + + #[test] + fn split_single_quotes() 
{ + split_ok(&[ + (r#"''"#, &[r#""#]), + (r#"'a'"#, &[r#"a"#]), + (r#"'\\'"#, &[r#"\"#]), + (r#"' \\ '"#, &[r#" \ "#]), + (r#"'#'"#, &[r#"#"#]), + ]); + } + + #[test] + fn split_double_quotes() { + split_ok(&[ + (r#""""#, &[""]), + (r#""""""#, &[""]), + (r#""a b c' d""#, &["a b c' d"]), + (r#""\$""#, &["$"]), + (r#""`""#, &["`"]), + (r#""\"""#, &["\""]), + (r#""\\""#, &["\\"]), + ("\"\n\"", &["\n"]), + ("\"\\\n\"", &[""]), + ]); + } + + #[test] + fn split_unquoted() { + split_ok(&[ + (r#"\\|\\&\\;"#, &[r#"\|\&\;"#]), + (r#"\\<\\>"#, &[r#"\<\>"#]), + (r#"\\(\\)"#, &[r#"\(\)"#]), + (r#"\$"#, &[r#"$"#]), + (r#"\""#, &[r#"""#]), + (r#"\'"#, &[r#"'"#]), + ("\\\n", &[]), + (" \\\n \n", &[]), + ("a\nb\nc", &["a", "b", "c"]), + ("a\\\nb\\\nc", &["abc"]), + ("foo bar baz", &["foo", "bar", "baz"]), + ]); + } + + #[test] + fn split_trailing_backslash() { + assert_eq!( + split("\\"), + Err(ParseError::InvalidBackslashAtEndOfStringInMinusS { + pos: 1, + quoting: "Delimiter".into() + }) + ); + assert_eq!( + split(" \\"), + Err(ParseError::InvalidBackslashAtEndOfStringInMinusS { + pos: 2, + quoting: "Delimiter".into() + }) + ); + assert_eq!( + split("a\\"), + Err(ParseError::InvalidBackslashAtEndOfStringInMinusS { + pos: 2, + quoting: "Unquoted".into() + }) + ); + } + + #[test] + fn split_errors() { + assert_eq!( + split("'abc"), + Err(ParseError::MissingClosingQuote { pos: 4, c: '\'' }) + ); + assert_eq!( + split("\""), + Err(ParseError::MissingClosingQuote { pos: 1, c: '"' }) + ); + assert_eq!( + split("'\\"), + Err(ParseError::MissingClosingQuote { pos: 2, c: '\'' }) + ); + assert_eq!( + split("'\\"), + Err(ParseError::MissingClosingQuote { pos: 2, c: '\'' }) + ); + assert_eq!( + split(r#""$""#), + Err(ParseError::ParsingOfVariableNameFailed { + pos: 2, + msg: "Missing variable name".into() + }), + ); + } + + #[test] + fn split_error_fail_with_unknown_escape_sequences() { + assert_eq!( + split("\\a"), + Err(ParseError::InvalidSequenceBackslashXInMinusS { pos: 1, c: 'a' }) + ); 
+ assert_eq!( + split("\"\\a\""), + Err(ParseError::InvalidSequenceBackslashXInMinusS { pos: 2, c: 'a' }) + ); + assert_eq!( + split("'\\a'"), + Err(ParseError::InvalidSequenceBackslashXInMinusS { pos: 2, c: 'a' }) + ); + assert_eq!( + split(r#""\a""#), + Err(ParseError::InvalidSequenceBackslashXInMinusS { pos: 2, c: 'a' }) + ); + assert_eq!( + split(r#"\🦉"#), + Err(ParseError::InvalidSequenceBackslashXInMinusS { + pos: 1, + c: '\u{FFFD}' + }) + ); + } + + #[test] + fn split_comments() { + split_ok(&[ + (r#" x # comment "#, &["x"]), + (r#" w1#w2 "#, &["w1#w2"]), + (r#"'not really a # comment'"#, &["not really a # comment"]), + (" a # very long comment \n b # another comment", &["a", "b"]), + ]); + } + + #[test] + fn test_quote() { + assert_eq!(quote(""), "''"); + assert_eq!(quote("'"), "''\\'''"); + assert_eq!(quote("abc"), "abc"); + assert_eq!(quote("a \n b"), "'a \n b'"); + assert_eq!(quote("X'\nY"), "'X'\\''\nY'"); + } + + #[test] + fn test_join() { + assert_eq!(join(["a", "b", "c"]), "a b c"); + assert_eq!(join([" ", "$", "\n"]), "' ' '$' '\n'"); + } + + #[test] + fn join_followed_by_split_is_identity() { + let cases: Vec<&[&str]> = vec![ + &["a"], + &["python", "-c", "print('Hello world!')"], + &["echo", " arg with spaces ", "arg \' with \" quotes"], + &["even newlines are quoted correctly\n", "\n", "\n\n\t "], + &["$", "`test`"], + &["cat", "~user/log*"], + &["test", "'a \"b", "\"X'"], + &["empty", "", "", ""], + ]; + for argv in cases { + let args = join(argv); + assert_eq!(split(&args).unwrap(), argv); + } + } +} + +mod test_raw_string_parser { + use env::raw_string_parser; + + #[test] + fn test_ascii_only_take_one_look_at_correct_data_and_end_behavior() { + let input = "hello"; + let mut uut = env::raw_string_parser::RawStringExpander::new(input); + for i in 0..input.len() { + assert_eq!( + input.as_bytes().get(i).unwrap(), + &uut.get_parser().look_at().unwrap() + ); + uut.take_one().unwrap(); + } + assert_eq!( + uut.get_parser().look_at(), + 
Err(raw_string_parser::Error { + look_at_pos: 5, + err_type: raw_string_parser::ErrorType::EndOfInput + }) + ); + uut.take_one().unwrap_err(); + uut.take_one().unwrap_err(); + assert_eq!(uut.take_collected_output().unwrap(), input); + uut.take_one().unwrap_err(); + assert_eq!(uut.take_collected_output().unwrap(), ""); + } + + #[test] + fn test_multi_byte_codes_take_one_look_at_correct_data_and_end_behavior() { + let input = "🦉🦉🦉x🦉🦉x🦉x🦉🦉🦉🦉"; + let first_byte_of_owl = *"🦉".as_bytes().first().unwrap(); + let mut uut = env::raw_string_parser::RawStringExpander::new(input); + for _i in 0..3 { + assert_eq!(uut.get_parser().look_at().unwrap(), first_byte_of_owl); + uut.take_one().unwrap(); + assert_eq!(uut.get_parser().look_at().unwrap(), b'x'); + uut.take_one().unwrap(); + } + assert_eq!(uut.get_parser().look_at().unwrap(), first_byte_of_owl); + uut.take_one().unwrap(); + assert_eq!( + uut.get_parser().look_at(), + Err(raw_string_parser::Error { + look_at_pos: 43, + err_type: raw_string_parser::ErrorType::EndOfInput + }) + ); + uut.take_one().unwrap_err(); + uut.take_one().unwrap_err(); + assert_eq!(uut.take_collected_output().unwrap(), input); + uut.take_one().unwrap_err(); + assert_eq!(uut.take_collected_output().unwrap(), ""); + } + + #[test] + fn test_multi_byte_codes_put_one_ascii_start_middle_end_try_invalid_ascii() { + let input = "🦉🦉🦉x🦉🦉x🦉x🦉🦉🦉🦉"; + let first_byte_of_owl = *"🦉".as_bytes().first().unwrap(); + let mut uut = env::raw_string_parser::RawStringExpander::new(input); + uut.put_one_ascii(b'a').unwrap(); + for _i in 0..3 { + assert_eq!(uut.get_parser().look_at().unwrap(), first_byte_of_owl); + uut.take_one().unwrap(); + uut.put_one_ascii(b'a').unwrap(); + assert_eq!(uut.get_parser().look_at().unwrap(), b'x'); + uut.take_one().unwrap(); + uut.put_one_ascii(b'a').unwrap(); + } + assert_eq!(uut.get_parser().look_at().unwrap(), first_byte_of_owl); + uut.take_one().unwrap(); + uut.put_one_ascii(first_byte_of_owl).unwrap_err(); + uut.put_one_ascii(b'a').unwrap(); 
+ assert_eq!( + uut.get_parser().look_at(), + Err(raw_string_parser::Error { + look_at_pos: 43, + err_type: raw_string_parser::ErrorType::EndOfInput + }) + ); + uut.take_one().unwrap_err(); + uut.put_one_ascii(b'a').unwrap(); + uut.take_one().unwrap_err(); + assert_eq!( + uut.take_collected_output().unwrap(), + "a🦉🦉🦉axa🦉🦉axa🦉axa🦉🦉🦉🦉aa" + ); + uut.take_one().unwrap_err(); + assert_eq!(uut.take_collected_output().unwrap(), ""); + } + + #[test] + fn test_multi_byte_codes_skip_one_take_one_skip_until_ascii_char_or_end() { + let input = "🦉🦉🦉x🦉🦉x🦉x🦉🦉🦉🦉"; + let mut uut = env::raw_string_parser::RawStringExpander::new(input); + + uut.skip_one().unwrap(); // skip 🦉🦉🦉 + assert_eq!(uut.get_look_at_pos(), 12); + + uut.skip_one().unwrap(); // skip x + assert_eq!(uut.get_look_at_pos(), 13); + uut.take_one().unwrap(); // take 🦉🦉 + assert_eq!(uut.get_look_at_pos(), 21); + + uut.skip_one().unwrap(); // skip x + assert_eq!(uut.get_look_at_pos(), 22); + uut.get_parser_mut() + .skip_until_ascii_char_or_end(b'x') + .unwrap(); // skip 🦉 + assert_eq!(uut.get_look_at_pos(), 26); + uut.take_one().unwrap(); // take x + uut.get_parser_mut() + .skip_until_ascii_char_or_end(b'x') + .unwrap(); // skip 🦉🦉🦉🦉 till end + assert_eq!(uut.get_look_at_pos(), 43); + + uut.take_one().unwrap_err(); + assert_eq!(uut.take_collected_output().unwrap(), "🦉🦉x"); + } + + #[test] + fn test_multi_byte_codes_skip_multiple_ascii_bounded_good_and_bad() { + let input = "🦉🦉🦉x🦉🦉x🦉x🦉🦉🦉🦉"; + let mut uut = env::raw_string_parser::RawStringExpander::new(input); + + uut.get_parser_mut().skip_multiple_ascii_bounded(0).unwrap(); + assert_eq!(uut.get_look_at_pos(), 0); + for i in 1..12 { + uut.get_parser_mut() + .skip_multiple_ascii_bounded(i) + .unwrap_err(); + assert_eq!(uut.get_look_at_pos(), 0); + } + uut.get_parser_mut() + .skip_multiple_ascii_bounded(12) + .unwrap(); // skips 🦉🦉🦉 + assert_eq!(uut.get_look_at_pos(), 12); + + uut.take_one().unwrap(); // take x + assert_eq!(uut.get_look_at_pos(), 13); + uut.get_parser_mut() 
+ .skip_multiple_ascii_bounded(12) + .unwrap_err(); + assert_eq!(uut.get_look_at_pos(), 13); + uut.get_parser_mut() + .skip_multiple_ascii_bounded(13) + .unwrap(); // skips 🦉🦉x🦉 + assert_eq!(uut.get_look_at_pos(), 26); + uut.take_one().unwrap(); // take x + + uut.get_parser_mut() + .skip_multiple_ascii_bounded(15) + .unwrap_err(); + assert_eq!(uut.get_look_at_pos(), 27); + uut.get_parser_mut() + .skip_multiple_ascii_bounded(17) + .unwrap_err(); + assert_eq!(uut.get_look_at_pos(), 27); + uut.get_parser_mut() + .skip_multiple_ascii_bounded(16) + .unwrap(); // skips 🦉🦉🦉🦉 + assert_eq!(uut.get_look_at_pos(), 43); + + uut.take_one().unwrap_err(); + assert_eq!(uut.take_collected_output().unwrap(), "xx"); + } + + #[test] + fn test_multi_byte_codes_put_string_utf8_start_middle_end() { + let input = "🦉🦉🦉x🦉🦉x🦉x🦉🦉🦉🦉"; + let mut uut = env::raw_string_parser::RawStringExpander::new(input); + + uut.put_string_utf8("🦔oo").unwrap(); + uut.take_one().unwrap(); // takes 🦉🦉🦉 + uut.put_string_utf8("oo🦔").unwrap(); + uut.take_one().unwrap(); // take x + uut.get_parser_mut() + .skip_until_ascii_char_or_end(b'\n') + .unwrap(); // skips till end + uut.put_string_utf8("o🦔o").unwrap(); + + uut.take_one().unwrap_err(); + assert_eq!(uut.take_collected_output().unwrap(), "🦔oo🦉🦉🦉oo🦔xo🦔o"); + } + + #[test] + fn test_multi_byte_codes_look_at_remaining_start_middle_end() { + let input = "🦉🦉🦉x🦉🦉x🦉x🦉🦉🦉🦉"; + let mut uut = env::raw_string_parser::RawStringExpander::new(input); + + assert_eq!(uut.get_parser().look_at_remaining().unwrap(), input); + uut.take_one().unwrap(); // takes 🦉🦉🦉 + assert_eq!( + uut.get_parser().look_at_remaining().unwrap(), + input.get(12..).unwrap() + ); + uut.get_parser_mut() + .skip_until_ascii_char_or_end(b'\n') + .unwrap(); // skips till end + assert_eq!(uut.get_parser().look_at_remaining().unwrap(), ""); + + uut.take_one().unwrap_err(); + assert_eq!(uut.take_collected_output().unwrap(), "🦉🦉🦉"); + } +} diff --git a/tests/common/util.rs b/tests/common/util.rs index 
5dac61f7eac..1daebec01d8 100644 --- a/tests/common/util.rs +++ b/tests/common/util.rs @@ -12,7 +12,6 @@ use pretty_assertions::assert_eq; use rlimit::prlimit; #[cfg(feature = "sleep")] use rstest::rstest; -#[cfg(unix)] use std::borrow::Cow; use std::collections::VecDeque; #[cfg(not(windows))] @@ -344,6 +343,11 @@ impl CmdResult { std::str::from_utf8(&self.stderr).unwrap() } + /// Returns the program's standard error as a string slice, automatically handling invalid utf8 + pub fn stderr_str_lossy(&self) -> Cow<'_, str> { + String::from_utf8_lossy(&self.stderr) + } + /// Returns the program's standard error as a string /// consumes self pub fn stderr_move_str(self) -> String { @@ -2336,7 +2340,7 @@ pub fn whoami() -> String { pub fn host_name_for(util_name: &str) -> Cow { // In some environments, e.g. macOS/freebsd, the GNU coreutils are prefixed with "g" // to not interfere with the BSD counterparts already in `$PATH`. - #[cfg(not(target_os = "linux"))] + #[cfg(all(not(target_os = "linux"), not(target_os = "android")))] { // make call to `host_name_for` idempotent if util_name.starts_with('g') && util_name != "groups" { @@ -2479,7 +2483,7 @@ pub fn expected_result(ts: &TestScenario, args: &[&str]) -> std::result::Result< let (stdout, stderr): (String, String) = if cfg!(target_os = "linux") { ( result.stdout_str().to_string(), - result.stderr_str().to_string(), + result.stderr_str_lossy().to_string(), ) } else { // `host_name_for` added prefix, strip 'g' prefix from results: @@ -2487,7 +2491,7 @@ pub fn expected_result(ts: &TestScenario, args: &[&str]) -> std::result::Result< let to = &from[1..]; ( result.stdout_str().replace(&from, to), - result.stderr_str().replace(&from, to), + result.stderr_str_lossy().replace(&from, to), ) }; diff --git a/tests/fixtures/env/runBat.bat b/tests/fixtures/env/runBat.bat new file mode 100644 index 00000000000..63ab744d3ab --- /dev/null +++ b/tests/fixtures/env/runBat.bat @@ -0,0 +1 @@ +echo Hello Windows World! 
diff --git a/util/build-gnu.sh b/util/build-gnu.sh index 9fdb3079d9d..92a46b71e58 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -221,6 +221,8 @@ grep -rlE '/usr/local/bin/\s?/usr/local/bin' init.cfg tests/* | xargs -r sed -Ei # we should not regress our project just to match what GNU is going. # So, do some changes on the fly +patch -N -r - -d "$path_GNU" -p 1 -i "`realpath \"$path_UUTILS/util/gnu-patches/tests_env_env-S.pl.patch\"`" || true + sed -i -e "s|rm: cannot remove 'e/slink'|rm: cannot remove 'e'|g" tests/rm/fail-eacces.sh sed -i -e "s|rm: cannot remove 'a/b'|rm: cannot remove 'a'|g" tests/rm/fail-2eperm.sh diff --git a/util/gnu-patches/tests_env_env-S.pl.patch b/util/gnu-patches/tests_env_env-S.pl.patch new file mode 100644 index 00000000000..404a00ca60e --- /dev/null +++ b/util/gnu-patches/tests_env_env-S.pl.patch @@ -0,0 +1,47 @@ +diff --git a/tests/env/env-S.pl b/tests/env/env-S.pl +index 710ca82cf..af7cf6efa 100755 +--- a/tests/env/env-S.pl ++++ b/tests/env/env-S.pl +@@ -209,27 +209,28 @@ my @Tests = + {ERR=>"$prog: no terminating quote in -S string\n"}], + ['err5', q[-S'A=B\\q'], {EXIT=>125}, + {ERR=>"$prog: invalid sequence '\\q' in -S\n"}], +- ['err6', q[-S'A=$B'], {EXIT=>125}, +- {ERR=>"$prog: only \${VARNAME} expansion is supported, error at: \$B\n"}], ++ ['err6', q[-S'A=$B echo hello'], {EXIT=>0}, ++ {OUT=>"hello"}], + ['err7', q[-S'A=${B'], {EXIT=>125}, +- {ERR=>"$prog: only \${VARNAME} expansion is supported, " . +- "error at: \${B\n"}], ++ {ERR=>"$prog" . qq[: variable name issue (at 5): Missing closing brace\n]}], + ['err8', q[-S'A=${B%B}'], {EXIT=>125}, +- {ERR=>"$prog: only \${VARNAME} expansion is supported, " . +- "error at: \${B%B}\n"}], ++ {ERR=>"$prog" . qq[: variable name issue (at 5): Unexpected character: '%', expected a closing brace ('}') or colon (':')\n]}], + ['err9', q[-S'A=${9B}'], {EXIT=>125}, +- {ERR=>"$prog: only \${VARNAME} expansion is supported, " . +- "error at: \${9B}\n"}], ++ {ERR=>"$prog" . 
qq[: variable name issue (at 4): Unexpected character: '9', expected variable name must not start with 0..9\n]}], + + # Test incorrect shebang usage (extraneous whitespace). + ['err_sp2', q['-v -S cat -n'], {EXIT=>125}, +- {ERR=>"env: invalid option -- ' '\n" . +- "env: use -[v]S to pass options in shebang lines\n" . +- "Try 'env --help' for more information.\n"}], ++ {ERR=>"$prog: error: unexpected argument '- ' found\n\n" . ++ " tip: to pass '- ' as a value, use '-- - '\n\n" . ++ "Usage: $prog [OPTION]... [-] [NAME=VALUE]... [COMMAND [ARG]...]\n\n" . ++ "For more information, try '--help'.\n" . ++ "$prog: use -[v]S to pass options in shebang lines\n"}], + ['err_sp3', q['-v -S cat -n'], {EXIT=>125}, # embedded tab after -v +- {ERR=>"env: invalid option -- '\t'\n" . +- "env: use -[v]S to pass options in shebang lines\n" . +- "Try 'env --help' for more information.\n"}], ++ {ERR=>"$prog: error: unexpected argument '-\t' found\n\n" . ++ " tip: to pass '-\t' as a value, use '-- -\t'\n\n" . ++ "Usage: $prog [OPTION]... [-] [NAME=VALUE]... [COMMAND [ARG]...]\n\n" . ++ "For more information, try '--help'.\n" . ++ "$prog: use -[v]S to pass options in shebang lines\n"}], + + # Also diagnose incorrect shebang usage when failing to exec. + # This typically happens with: