diff --git a/Cargo.lock b/Cargo.lock index e283f0c5557..ddc1dc5b359 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -537,6 +537,7 @@ name = "coreutils" version = "0.5.0" dependencies = [ "bincode", + "bytecount", "chrono", "clap", "clap_complete", @@ -563,6 +564,7 @@ dependencies = [ "tempfile", "textwrap", "time", + "unicode-width 0.2.2", "unindent", "uu_arch", "uu_base32", diff --git a/Cargo.toml b/Cargo.toml index b388373a2aa..ac6763ca346 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -535,6 +535,7 @@ ctor.workspace = true filetime.workspace = true glob.workspace = true libc.workspace = true +bytecount.workspace = true num-prime.workspace = true pretty_assertions = "1.4.0" rand.workspace = true @@ -542,6 +543,7 @@ regex.workspace = true sha1 = { workspace = true, features = ["std"] } tempfile.workspace = true time = { workspace = true, features = ["local-offset"] } +unicode-width.workspace = true unindent = "0.2.3" uutests.workspace = true uucore = { workspace = true, features = [ diff --git a/src/uu/fold/src/fold.rs b/src/uu/fold/src/fold.rs index 2eb97933180..00b3a45cc77 100644 --- a/src/uu/fold/src/fold.rs +++ b/src/uu/fold/src/fold.rs @@ -19,6 +19,8 @@ const TAB_WIDTH: usize = 8; const NL: u8 = b'\n'; const CR: u8 = b'\r'; const TAB: u8 = b'\t'; +// Implementation threshold (8 KiB) to prevent unbounded buffer growth during streaming. +const STREAMING_FLUSH_THRESHOLD: usize = 8 * 1024; mod options { pub const BYTES: &str = "bytes"; @@ -288,6 +290,8 @@ fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize { } fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { + // Emit a folded line and keep the remaining buffer (if any) for the next line. + // When `-s` is active, we prefer breaking at the last recorded whitespace. let consume = match *ctx.last_space { Some(index) => index + 1, None => ctx.output.len(), @@ -322,6 +326,36 @@ fn emit_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { Ok(()) } +fn maybe_flush_unbroken_output(ctx: &mut FoldContext<'_, W>) -> UResult<()> { + // In streaming mode without `-s`, avoid unbounded buffering by periodically + // flushing long unbroken output segments. When `-s` is enabled we must keep + // the buffer to preserve the last whitespace boundary for folding. + if ctx.spaces || ctx.output.len() < STREAMING_FLUSH_THRESHOLD { + return Ok(()); + } + + if !ctx.output.is_empty() { + ctx.writer.write_all(ctx.output)?; + ctx.output.clear(); + } + Ok(()) +} + +fn push_byte(ctx: &mut FoldContext<'_, W>, byte: u8) -> UResult<()> { + // Append a single byte and flush if the buffer grows too large. + ctx.output.push(byte); + maybe_flush_unbroken_output(ctx) +} + +fn push_bytes(ctx: &mut FoldContext<'_, W>, bytes: &[u8]) -> UResult<()> { + // Append a byte slice and flush if the buffer grows too large. + if bytes.is_empty() { + return Ok(()); + } + ctx.output.extend_from_slice(bytes); + maybe_flush_unbroken_output(ctx) +} + fn process_ascii_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { let mut idx = 0; let len = line.len(); @@ -331,15 +365,15 @@ fn process_ascii_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UR NL => { *ctx.last_space = None; emit_output(ctx)?; - break; + idx += 1; } CR => { - ctx.output.push(CR); + push_byte(ctx, CR)?; *ctx.col_count = 0; idx += 1; } 0x08 => { - ctx.output.push(0x08); + push_byte(ctx, 0x08)?; *ctx.col_count = ctx.col_count.saturating_sub(1); idx += 1; } @@ -358,16 +392,23 @@ fn process_ascii_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UR } else { *ctx.last_space = None; } - ctx.output.push(TAB); + push_byte(ctx, TAB)?; idx += 1; } 0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => { - ctx.output.push(line[idx]); + push_byte(ctx, line[idx])?; if ctx.spaces && line[idx].is_ascii_whitespace() && line[idx] != CR { *ctx.last_space = Some(ctx.output.len() - 1); } else if !ctx.spaces { *ctx.last_space = None; } + + if ctx.mode == WidthMode::Characters { + *ctx.col_count = ctx.col_count.saturating_add(1); + if *ctx.col_count >= ctx.width { + emit_output(ctx)?; + } + } idx += 1; } _ => { @@ -405,7 +446,7 @@ fn push_ascii_segment(segment: &[u8], ctx: &mut FoldContext<'_, W>) -> let take = remaining.len().min(available); let base_len = ctx.output.len(); - ctx.output.extend_from_slice(&remaining[..take]); + push_bytes(ctx, &remaining[..take])?; *ctx.col_count += take; if ctx.spaces { @@ -430,16 +471,26 @@ fn process_utf8_line(line: &str, ctx: &mut FoldContext<'_, W>) -> URes return process_ascii_line(line.as_bytes(), ctx); } + process_utf8_chars(line, ctx) +} + +fn process_utf8_chars(line: &str, ctx: &mut FoldContext<'_, W>) -> UResult<()> { let line_bytes = line.as_bytes(); let mut iter = line.char_indices().peekable(); while let Some((byte_idx, ch)) = iter.next() { - // Include combining characters with the base character - while let Some(&(_, next_ch)) = iter.peek() { - if unicode_width::UnicodeWidthChar::width(next_ch).unwrap_or(1) == 0 { - iter.next(); - } else { - break; + // Include combining characters with the base character when we are + // measuring by display columns. In character-counting mode every + // scalar value must advance the counter to match `chars().count()` + // semantics (see `fold_characters_reference` in the tests), so we do + // not coalesce zero-width scalars there. + if ctx.mode == WidthMode::Columns { + while let Some(&(_, next_ch)) = iter.peek() { + if unicode_width::UnicodeWidthChar::width(next_ch).unwrap_or(1) == 0 { + iter.next(); + } else { + break; + } } } @@ -448,7 +499,7 @@ fn process_utf8_line(line: &str, ctx: &mut FoldContext<'_, W>) -> URes if ch == '\n' { *ctx.last_space = None; emit_output(ctx)?; - break; + continue; } if *ctx.col_count >= ctx.width { @@ -456,15 +507,13 @@ fn process_utf8_line(line: &str, ctx: &mut FoldContext<'_, W>) -> URes } if ch == '\r' { - ctx.output - .extend_from_slice(&line_bytes[byte_idx..next_idx]); + push_bytes(ctx, &line_bytes[byte_idx..next_idx])?; *ctx.col_count = 0; continue; } if ch == '\x08' { - ctx.output - .extend_from_slice(&line_bytes[byte_idx..next_idx]); + push_bytes(ctx, &line_bytes[byte_idx..next_idx])?; *ctx.col_count = ctx.col_count.saturating_sub(1); continue; } @@ -484,8 +533,7 @@ fn process_utf8_line(line: &str, ctx: &mut FoldContext<'_, W>) -> URes } else { *ctx.last_space = None; } - ctx.output - .extend_from_slice(&line_bytes[byte_idx..next_idx]); + push_bytes(ctx, &line_bytes[byte_idx..next_idx])?; continue; } @@ -506,8 +554,7 @@ fn process_utf8_line(line: &str, ctx: &mut FoldContext<'_, W>) -> URes *ctx.last_space = Some(ctx.output.len()); } - ctx.output - .extend_from_slice(&line_bytes[byte_idx..next_idx]); + push_bytes(ctx, &line_bytes[byte_idx..next_idx])?; *ctx.col_count = ctx.col_count.saturating_add(added); } @@ -519,7 +566,7 @@ fn process_non_utf8_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> if byte == NL { *ctx.last_space = None; emit_output(ctx)?; - break; + continue; } if *ctx.col_count >= ctx.width { @@ -539,7 +586,7 @@ fn process_non_utf8_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> } else { None }; - ctx.output.push(byte); + push_byte(ctx, byte)?; continue; } 0x08 => *ctx.col_count = ctx.col_count.saturating_sub(1), @@ -550,7 +597,46 @@ fn process_non_utf8_line(line: &[u8], ctx: &mut FoldContext<'_, W>) -> _ => *ctx.col_count = ctx.col_count.saturating_add(1), } - ctx.output.push(byte); + push_byte(ctx, byte)?; + } + + Ok(()) +} + +/// Process buffered bytes, emitting output for valid UTF-8 prefixes and +/// deferring incomplete sequences until more input arrives. +/// +/// If the buffer contains invalid UTF-8, it is handled in non-UTF-8 mode and +/// the buffer is fully consumed. +fn process_pending_chunk( + pending: &mut Vec, + ctx: &mut FoldContext<'_, W>, +) -> UResult<()> { + while !pending.is_empty() { + match std::str::from_utf8(pending) { + Ok(valid) => { + process_utf8_line(valid, ctx)?; + pending.clear(); + break; + } + Err(err) => { + if err.error_len().is_some() { + let res = process_non_utf8_line(pending, ctx); + pending.clear(); + res?; + break; + } + + let valid_up_to = err.valid_up_to(); + if valid_up_to == 0 { + break; + } + + let valid = std::str::from_utf8(&pending[..valid_up_to]).expect("valid prefix"); + process_utf8_line(valid, ctx)?; + pending.drain(..valid_up_to); + } + } } Ok(()) @@ -572,20 +658,12 @@ fn fold_file( mode: WidthMode, writer: &mut W, ) -> UResult<()> { - let mut line = Vec::new(); let mut output = Vec::new(); let mut col_count = 0; let mut last_space = None; + let mut pending = Vec::new(); - loop { - if file - .read_until(NL, &mut line) - .map_err_context(|| translate!("fold-error-readline"))? - == 0 - { - break; - } - + { let mut ctx = FoldContext { spaces, width, @@ -596,17 +674,32 @@ fn fold_file( last_space: &mut last_space, }; - match std::str::from_utf8(&line) { - Ok(s) => process_utf8_line(s, &mut ctx)?, - Err(_) => process_non_utf8_line(&line, &mut ctx)?, + loop { + let buffer = file + .fill_buf() + .map_err_context(|| translate!("fold-error-readline"))?; + if buffer.is_empty() { + break; + } + pending.extend_from_slice(buffer); + let consumed = buffer.len(); + file.consume(consumed); + + process_pending_chunk(&mut pending, &mut ctx)?; } - line.clear(); - } + if !pending.is_empty() { + match std::str::from_utf8(&pending) { + Ok(s) => process_utf8_line(s, &mut ctx)?, + Err(_) => process_non_utf8_line(&pending, &mut ctx)?, + } + pending.clear(); + } - if !output.is_empty() { - writer.write_all(&output)?; - output.clear(); + if !ctx.output.is_empty() { + ctx.writer.write_all(ctx.output)?; + ctx.output.clear(); + } } Ok(()) diff --git a/tests/by-util/test_fold.rs b/tests/by-util/test_fold.rs index 9497044c910..1fe466ba5ad 100644 --- a/tests/by-util/test_fold.rs +++ b/tests/by-util/test_fold.rs @@ -4,7 +4,11 @@ // file that was distributed with this source code. // spell-checker:ignore fullwidth +use bytecount::count; +use unicode_width::UnicodeWidthChar; use uutests::new_ucmd; +use uutests::util::TestScenario; +use uutests::util_name; #[test] fn test_invalid_arg() { @@ -61,6 +65,301 @@ fn test_wide_characters_with_characters_option() { .stdout_is("\u{B250}\u{B250}\u{B250}\n"); } +#[test] +fn test_multiple_wide_characters_in_column_mode() { + let wide = '\u{FF1A}'; + let mut input = wide.to_string().repeat(50); + input.push('\n'); + + let mut expected = String::new(); + for i in 1..=50 { + expected.push(wide); + if i % 5 == 0 { + expected.push('\n'); + } + } + + new_ucmd!() + .args(&["-w", "10"]) + .pipe_in(input) + .succeeds() + .stdout_is(expected); +} + +#[test] +fn test_multiple_wide_characters_in_character_mode() { + let wide = '\u{FF1A}'; + let mut input = wide.to_string().repeat(50); + input.push('\n'); + + let mut expected = String::new(); + for i in 1..=50 { + expected.push(wide); + if i % 10 == 0 { + expected.push('\n'); + } + } + + new_ucmd!() + .args(&["--characters", "-w", "10"]) + .pipe_in(input) + .succeeds() + .stdout_is(expected); +} + +#[test] +fn test_unicode_on_reader_buffer_boundary_in_character_mode() { + let boundary = buf_reader_capacity().saturating_sub(1); + assert!(boundary > 0, "BufReader capacity must be greater than 1"); + + let mut input = "a".repeat(boundary); + input.push('\u{B250}'); + input.push_str(&"a".repeat(100)); + input.push('\n'); + + let expected_tail = tail_inclusive(&fold_characters_reference(&input, 80), 4); + + let result = new_ucmd!().arg("--characters").pipe_in(input).succeeds(); + + let actual_tail = tail_inclusive(result.stdout_str(), 4); + + assert_eq!(actual_tail, expected_tail); +} + +#[test] +fn test_fold_preserves_invalid_utf8_sequences() { + let bad_input: &[u8] = b"\xC3|\xED\xBA\xAD|\x00|\x89|\xED\xA6\xBF\xED\xBF\xBF\n"; + + new_ucmd!() + .pipe_in(bad_input.to_vec()) + .succeeds() + .stdout_is_bytes(bad_input); +} + +#[test] +fn test_fold_preserves_incomplete_utf8_at_eof() { + let trailing_byte: &[u8] = b"\xC3"; + + new_ucmd!() + .pipe_in(trailing_byte.to_vec()) + .succeeds() + .stdout_is_bytes(trailing_byte); +} + +#[test] +fn test_zero_width_bytes_in_column_mode() { + let len = io_buf_size_times_two(); + let input = vec![0u8; len]; + + new_ucmd!() + .pipe_in(input.clone()) + .succeeds() + .stdout_is_bytes(input); +} + +#[test] +fn test_zero_width_bytes_in_character_mode() { + let len = io_buf_size_times_two(); + let input = vec![0u8; len]; + let expected = fold_characters_reference_bytes(&input, 80); + + new_ucmd!() + .args(&["--characters"]) + .pipe_in(input) + .succeeds() + .stdout_is_bytes(expected); +} + +#[test] +fn test_zero_width_spaces_in_column_mode() { + let len = io_buf_size_times_two(); + let input = "\u{200B}".repeat(len); + + new_ucmd!() + .pipe_in(input.clone()) + .succeeds() + .stdout_is(&input); +} + +#[test] +fn test_zero_width_spaces_in_character_mode() { + let len = io_buf_size_times_two(); + let input = "\u{200B}".repeat(len); + let expected = fold_characters_reference(&input, 80); + + new_ucmd!() + .args(&["--characters"]) + .pipe_in(input) + .succeeds() + .stdout_is(&expected); +} + +#[test] +fn test_zero_width_bytes_from_file() { + let len = io_buf_size_times_two(); + let input = vec![0u8; len]; + let expected = fold_characters_reference_bytes(&input, 80); + + let ts = TestScenario::new(util_name!()); + let path = "zeros.bin"; + ts.fixtures.write_bytes(path, &input); + + ts.ucmd().arg(path).succeeds().stdout_is_bytes(&input); + + ts.ucmd() + .args(&["--characters", path]) + .succeeds() + .stdout_is_bytes(expected); +} + +#[test] +fn test_zero_width_spaces_from_file() { + let len = io_buf_size_times_two(); + let input = "\u{200B}".repeat(len); + let expected = fold_characters_reference(&input, 80); + + let ts = TestScenario::new(util_name!()); + let path = "zero-width.txt"; + ts.fixtures.write(path, &input); + + ts.ucmd().arg(path).succeeds().stdout_is(&input); + + ts.ucmd() + .args(&["--characters", path]) + .succeeds() + .stdout_is(&expected); +} + +#[test] +fn test_zero_width_data_line_counts() { + let len = io_buf_size_times_two(); + + let zero_bytes = vec![0u8; len]; + let column_bytes = new_ucmd!().pipe_in(zero_bytes.clone()).succeeds(); + assert_eq!( + newline_count(column_bytes.stdout()), + 0, + "fold should not wrap zero-width bytes in column mode", + ); + + let characters_bytes = new_ucmd!() + .args(&["--characters"]) + .pipe_in(zero_bytes) + .succeeds(); + assert_eq!( + newline_count(characters_bytes.stdout()), + len / 80, + "fold --characters should wrap zero-width bytes every 80 bytes", + ); + + if UnicodeWidthChar::width('\u{200B}') != Some(0) { + eprintln!("skip zero width space checks because width != 0"); + return; + } + + let zero_width_spaces = "\u{200B}".repeat(len); + let column_spaces = new_ucmd!().pipe_in(zero_width_spaces.clone()).succeeds(); + assert_eq!( + newline_count(column_spaces.stdout()), + 0, + "fold should keep zero-width spaces on a single line in column mode", + ); + + let characters_spaces = new_ucmd!() + .args(&["--characters"]) + .pipe_in(zero_width_spaces) + .succeeds(); + assert_eq!( + newline_count(characters_spaces.stdout()), + len / 80, + "fold --characters should wrap zero-width spaces every 80 characters", + ); +} + +#[cfg(any(target_os = "linux", target_os = "freebsd", target_os = "netbsd"))] +#[test] +fn test_fold_reports_no_space_left_on_dev_full() { + use std::fs::OpenOptions; + use std::process::Stdio; + + for &byte in &[b'\n', b'\0', 0xC3u8] { + let dev_full = OpenOptions::new() + .write(true) + .open("/dev/full") + .expect("/dev/full must exist on supported targets"); + + new_ucmd!() + .pipe_in(vec![byte; 1024]) + .set_stdout(Stdio::from(dev_full)) + .fails() + .stderr_contains("No space left"); + } +} + +fn buf_reader_capacity() -> usize { + std::io::BufReader::new(&b""[..]).capacity() +} + +fn io_buf_size_times_two() -> usize { + buf_reader_capacity() + .checked_mul(2) + .expect("BufReader capacity overflow") +} + +fn fold_characters_reference(input: &str, width: usize) -> String { + let mut output = String::with_capacity(input.len()); + let mut col_count = 0usize; + + for ch in input.chars() { + if ch == '\n' { + output.push('\n'); + col_count = 0; + continue; + } + + if col_count >= width { + output.push('\n'); + col_count = 0; + } + + output.push(ch); + col_count += 1; + } + + output +} + +fn fold_characters_reference_bytes(input: &[u8], width: usize) -> Vec { + let mut output = Vec::with_capacity(input.len() + input.len() / width + 1); + + for chunk in input.chunks(width) { + output.extend_from_slice(chunk); + if chunk.len() == width { + output.push(b'\n'); + } + } + + output +} + +fn newline_count(bytes: &[u8]) -> usize { + count(bytes, b'\n') +} + +fn tail_inclusive(text: &str, lines: usize) -> String { + if lines == 0 { + return String::new(); + } + + let segments: Vec<&str> = text.split_inclusive('\n').collect(); + if segments.is_empty() { + return text.to_owned(); + } + + let start = segments.len().saturating_sub(lines); + segments[start..].concat() +} + #[test] fn test_should_preserve_empty_line_without_final_newline() { new_ucmd!()