Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

split: fix bug with large arguments to -C #7128

Merged
merged 1 commit into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
325 changes: 109 additions & 216 deletions src/uu/split/src/split.rs
Original file line number Diff line number Diff line change
Expand Up @@ -919,204 +919,6 @@ impl Write for LineChunkWriter<'_> {
}
}

/// Write lines to each sequential output files, limited by bytes.
///
/// This struct maintains an underlying writer representing the
/// current chunk of the output. On each call to [`write`], it writes
/// as many lines as possible to the current chunk without exceeding
/// the specified byte limit. If a single line has more bytes than the
/// limit, then fill an entire single chunk with those bytes and
/// handle the remainder of the line as if it were its own distinct
/// line. As many new underlying writers are created as needed to
/// write all the data in the input buffer.
struct LineBytesChunkWriter<'a> {
/// Parameters for creating the underlying writer for each new chunk.
settings: &'a Settings,

/// The maximum number of bytes allowed for a single chunk of output.
chunk_size: u64,

/// Running total of number of chunks that have been completed.
num_chunks_written: usize,

/// Remaining capacity in number of bytes in the current chunk.
///
/// This number starts at `chunk_size` and decreases as lines are
/// written. Once it reaches zero, a writer for a new chunk is
/// initialized and this number gets reset to `chunk_size`.
num_bytes_remaining_in_current_chunk: usize,

/// The underlying writer for the current chunk.
///
/// Once the number of bytes written to this writer exceeds
/// `chunk_size`, a new writer is initialized and assigned to this
/// field.
inner: BufWriter<Box<dyn Write>>,

/// Iterator that yields filenames for each chunk.
filename_iterator: FilenameIterator<'a>,
}

impl<'a> LineBytesChunkWriter<'a> {
fn new(chunk_size: u64, settings: &'a Settings) -> UResult<Self> {
let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)?;
let filename = filename_iterator
.next()
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
if settings.verbose {
println!("creating file {}", filename.quote());
}
let inner = settings.instantiate_current_writer(&filename, true)?;
Ok(LineBytesChunkWriter {
settings,
chunk_size,
num_bytes_remaining_in_current_chunk: usize::try_from(chunk_size).unwrap(),
num_chunks_written: 0,
inner,
filename_iterator,
})
}
}

impl Write for LineBytesChunkWriter<'_> {
/// Write as many lines to a chunk as possible without
/// exceeding the byte limit. If a single line has more bytes
/// than the limit, then fill an entire single chunk with those
/// bytes and handle the remainder of the line as if it were
/// its own distinct line.
///
/// For example: if the `chunk_size` is 8 and the input is:
///
/// ```text
/// aaaaaaaaa\nbbbb\ncccc\ndd\nee\n
/// ```
///
/// then the output gets broken into chunks like this:
///
/// ```text
/// chunk 0 chunk 1 chunk 2 chunk 3
///
/// 0 1 2
/// 01234567 89 01234 56789 012 345 6
/// |------| |-------| |--------| |---|
/// aaaaaaaa a\nbbbb\n cccc\ndd\n ee\n
/// ```
///
/// Implements `--line-bytes=SIZE`
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
// The total number of bytes written during the loop below.
//
// It is necessary to keep this running total because we may
// be making multiple calls to `write()` on multiple different
// underlying writers and we want the final reported number of
// bytes written to reflect the total number of bytes written
// to all of the underlying writers.
let mut total_bytes_written = 0;

// Loop until we have written all bytes in the input buffer
// (or an IO error occurs).
loop {
// If the buffer is empty, then we are done writing.
if buf.is_empty() {
return Ok(total_bytes_written);
}

// If we have filled the current chunk with bytes, then
// start a new chunk and initialize its corresponding
// writer.
if self.num_bytes_remaining_in_current_chunk == 0 {
self.num_chunks_written += 1;
let filename = self.filename_iterator.next().ok_or_else(|| {
std::io::Error::new(ErrorKind::Other, "output file suffixes exhausted")
})?;
if self.settings.verbose {
println!("creating file {}", filename.quote());
}
self.inner = self.settings.instantiate_current_writer(&filename, true)?;
self.num_bytes_remaining_in_current_chunk = self.chunk_size.try_into().unwrap();
}

// Find the first separator (default - newline character) in the buffer.
let sep = self.settings.separator;
match memchr::memchr(sep, buf) {
// If there is no separator character and the buffer is
// not empty, then write as many bytes as we can and
// then move on to the next chunk if necessary.
None => {
let end = self.num_bytes_remaining_in_current_chunk;

// This is ugly but here to match GNU behavior. If the input
// doesn't end with a separator, pretend that it does for handling
// the second to last segment chunk. See `line-bytes.sh`.
if end == buf.len()
&& self.num_bytes_remaining_in_current_chunk
< self.chunk_size.try_into().unwrap()
&& buf[buf.len() - 1] != sep
{
self.num_bytes_remaining_in_current_chunk = 0;
} else {
let num_bytes_written = custom_write(
&buf[..end.min(buf.len())],
&mut self.inner,
self.settings,
)?;
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
total_bytes_written += num_bytes_written;
buf = &buf[num_bytes_written..];
}
}

// If there is a separator character and the line
// (including the separator character) will fit in the
// current chunk, then write the entire line and
// continue to the next iteration. (See chunk 1 in the
// example comment above.)
Some(i) if i < self.num_bytes_remaining_in_current_chunk => {
let num_bytes_written =
custom_write(&buf[..=i], &mut self.inner, self.settings)?;
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
total_bytes_written += num_bytes_written;
buf = &buf[num_bytes_written..];
}

// If there is a separator character, the line
// (including the separator character) will not fit in
// the current chunk, *and* no other lines have been
// written to the current chunk, then write as many
// bytes as we can and continue to the next
// iteration. (See chunk 0 in the example comment
// above.)
Some(_)
if self.num_bytes_remaining_in_current_chunk
== self.chunk_size.try_into().unwrap() =>
{
let end = self.num_bytes_remaining_in_current_chunk;
let num_bytes_written =
custom_write(&buf[..end], &mut self.inner, self.settings)?;
self.num_bytes_remaining_in_current_chunk -= num_bytes_written;
total_bytes_written += num_bytes_written;
buf = &buf[num_bytes_written..];
}

// If there is a separator character, the line
// (including the separator character) will not fit in
// the current chunk, and at least one other line has
// been written to the current chunk, then signal to
// the next iteration that a new chunk needs to be
// created and continue to the next iteration of the
// loop to try writing the line there.
Some(_) => {
self.num_bytes_remaining_in_current_chunk = 0;
}
}
}
}

fn flush(&mut self) -> std::io::Result<()> {
self.inner.flush()
}
}

/// Output file parameters
struct OutFile {
filename: String,
Expand Down Expand Up @@ -1629,6 +1431,114 @@ where
Ok(())
}

/// Like `std::io::Lines`, but includes the line ending character.
///
/// This struct is generally created by calling `lines_with_sep` on a
/// reader.
pub struct LinesWithSep<R> {
inner: R,
separator: u8,
}

impl<R> Iterator for LinesWithSep<R>
where
R: BufRead,
{
type Item = std::io::Result<Vec<u8>>;

/// Read bytes from a buffer up to the requested number of lines.
fn next(&mut self) -> Option<Self::Item> {
let mut buf = vec![];
match self.inner.read_until(self.separator, &mut buf) {
Ok(0) => None,
Ok(_) => Some(Ok(buf)),
Err(e) => Some(Err(e)),
}
}
}

/// Like `std::str::lines` but includes the line ending character.
///
/// The `separator` defines the character to interpret as the line
/// ending. For the usual notion of "line", set this to `b'\n'`.
pub fn lines_with_sep<R>(reader: R, separator: u8) -> LinesWithSep<R>
where
R: BufRead,
{
LinesWithSep {
inner: reader,
separator,
}
}

fn line_bytes<R>(settings: &Settings, reader: &mut R, chunk_size: usize) -> UResult<()>
where
R: BufRead,
{
let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)?;

// Initialize the writer just to satisfy the compiler. It is going
// to be overwritten for sure at the beginning of the loop below
// because we start with `remaining == 0`, indicating that a new
// chunk should start.
let mut writer: BufWriter<Box<dyn Write>> =
BufWriter::new(Box::new(std::io::Cursor::new(vec![])));

let mut remaining = 0;
for line in lines_with_sep(reader, settings.separator) {
let line = line?;
let mut line = &line[..];
loop {
if remaining == 0 {
let filename = filename_iterator
.next()
.ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?;
if settings.verbose {
println!("creating file {}", filename.quote());
}
writer = settings.instantiate_current_writer(&filename, true)?;
remaining = chunk_size;
}

// Special case: if this is the last line and it doesn't end
// with a newline character, then count its length as though
// it did end with a newline. If that puts it over the edge
// of this chunk, continue to the next chunk.
if line.len() == remaining
&& remaining < chunk_size
&& line[line.len() - 1] != settings.separator
{
remaining = 0;
continue;
}

// If the entire line fits in this chunk, write it and
// continue to the next line.
if line.len() <= remaining {
custom_write_all(line, &mut writer, settings)?;
remaining -= line.len();
break;
}

// If the line is too large to fit in *any* chunk and we are
// at the start of a new chunk, write as much as we can of
// it and pass the remainder along to the next chunk.
if line.len() > chunk_size && remaining == chunk_size {
custom_write_all(&line[..chunk_size], &mut writer, settings)?;
line = &line[chunk_size..];
remaining = 0;
continue;
}

// If the line is too large to fit in *this* chunk, but
// might otherwise fit in the next chunk, then just continue
// to the next chunk and let it be handled there.
remaining = 0;
}
}
Ok(())
}

#[allow(clippy::cognitive_complexity)]
fn split(settings: &Settings) -> UResult<()> {
let r_box = if settings.input == "-" {
Expand Down Expand Up @@ -1701,23 +1611,6 @@ fn split(settings: &Settings) -> UResult<()> {
},
}
}
Strategy::LineBytes(chunk_size) => {
let mut writer = LineBytesChunkWriter::new(chunk_size, settings)?;
match std::io::copy(&mut reader, &mut writer) {
Ok(_) => Ok(()),
Err(e) => match e.kind() {
// TODO Since the writer object controls the creation of
// new files, we need to rely on the `std::io::Result`
// returned by its `write()` method to communicate any
// errors to this calling scope. If a new file cannot be
// created because we have exceeded the number of
// allowable filenames, we use `ErrorKind::Other` to
// indicate that. A special error message needs to be
// printed in that case.
ErrorKind::Other => Err(USimpleError::new(1, format!("{e}"))),
_ => Err(uio_error!(e, "input/output error")),
},
}
}
Strategy::LineBytes(chunk_size) => line_bytes(settings, &mut reader, chunk_size as usize),
}
}
17 changes: 17 additions & 0 deletions tests/by-util/test_split.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1973,3 +1973,20 @@ fn test_split_separator_same_multiple() {
.args(&["-t:", "-t:", "-t,", "fivelines.txt"])
.fails();
}

#[test]
fn test_long_lines() {
let (at, mut ucmd) = at_and_ucmd!();
let line1 = format!("{:131070}\n", "");
let line2 = format!("{:1}\n", "");
let line3 = format!("{:131071}\n", "");
let infile = [line1, line2, line3].concat();
ucmd.args(&["-C", "131072"])
.pipe_in(infile)
.succeeds()
.no_output();
assert_eq!(at.read("xaa").len(), 131_071);
assert_eq!(at.read("xab").len(), 2);
assert_eq!(at.read("xac").len(), 131_072);
assert!(!at.plus("xad").exists());
}
Loading