Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
6d721e5
feat(fold): add column counting for character width mode in process_a…
mattsu2020 Nov 14, 2025
b21cf35
fix fold: emit output early when column count reaches width limit
mattsu2020 Nov 14, 2025
77a1c31
fix: correct output emission logic in fold for character mode
mattsu2020 Nov 14, 2025
b0e0033
refactor(fold): split long if-conditions into multiple lines for read…
mattsu2020 Nov 14, 2025
ac09d10
feat(fold): add streaming output with periodic flushing to reduce mem…
mattsu2020 Nov 14, 2025
dba5b9b
fix(fold): correct premature output emission in character mode and ad…
mattsu2020 Nov 14, 2025
556804c
refactor: clean up formatting in fold utility and tests
mattsu2020 Nov 14, 2025
5424072
Merge branch 'main' into fold_compatibility
mattsu2020 Nov 15, 2025
a1bbd40
feat(fold): add unicode-width dependency and tests for zero-width cha…
mattsu2020 Nov 15, 2025
d176e65
perf: use bytecount for efficient newline counting in fold tests
mattsu2020 Nov 15, 2025
dbad8db
refactor: Handle zero-width bytes across buffer boundaries in fold
mattsu2020 Nov 15, 2025
cd6f536
refactor(fold): streamline process_pending_chunk loop and error handling
mattsu2020 Nov 15, 2025
75c542f
refactor(fold): condense variable assignment to single line
mattsu2020 Nov 15, 2025
662a011
Merge branch 'main' into fold_compatibility
mattsu2020 Nov 16, 2025
3b07c8e
Merge branch 'main' into fold_compatibility
mattsu2020 Nov 17, 2025
1d35753
Merge branch 'main' into fold_compatibility
mattsu2020 Nov 20, 2025
eb7d8c4
fix(fold): properly handle combining characters in character-counting…
mattsu2020 Nov 20, 2025
b9b46be
Merge branch 'main' into fold_compatibility
mattsu2020 Dec 1, 2025
5e1d7f5
Merge branch 'main' into fold_compatibility
mattsu2020 Dec 22, 2025
e54a74b
refactor(fold): add comments to explain streaming flush logic and pre…
mattsu2020 Dec 22, 2025
5de4b28
refactor(fold): extract UTF-8 char processing and improve pending chu…
mattsu2020 Dec 22, 2025
0605e24
Merge branch 'main' into fold_compatibility
mattsu2020 Dec 25, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -535,13 +535,15 @@ ctor.workspace = true
filetime.workspace = true
glob.workspace = true
libc.workspace = true
bytecount.workspace = true
num-prime.workspace = true
pretty_assertions = "1.4.0"
rand.workspace = true
regex.workspace = true
sha1 = { workspace = true, features = ["std"] }
tempfile.workspace = true
time = { workspace = true, features = ["local-offset"] }
unicode-width.workspace = true
unindent = "0.2.3"
uutests.workspace = true
uucore = { workspace = true, features = [
Expand Down
177 changes: 135 additions & 42 deletions src/uu/fold/src/fold.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ const TAB_WIDTH: usize = 8;
// Byte values of the control characters that get dedicated handling while folding.
const NL: u8 = b'\n';
const CR: u8 = b'\r';
const TAB: u8 = b'\t';
// Implementation threshold (8 KiB) to prevent unbounded buffer growth during streaming.
// Once the pending output buffer holds at least this many bytes, and `-s` is not in
// effect, the buffer is written to the output and cleared.
const STREAMING_FLUSH_THRESHOLD: usize = 8 * 1024;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please document this magic number


mod options {
pub const BYTES: &str = "bytes";
Expand Down Expand Up @@ -288,6 +290,8 @@ fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize {
}

fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> {
// Emit a folded line and keep the remaining buffer (if any) for the next line.
// When `-s` is active, we prefer breaking at the last recorded whitespace.
let consume = match *ctx.last_space {
Some(index) => index + 1,
None => ctx.output.len(),
Expand Down Expand Up @@ -322,6 +326,36 @@ fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> {
Ok(())
}

/// Writes the pending output buffer to the writer and clears it once the buffer
/// has reached `STREAMING_FLUSH_THRESHOLD` bytes.
///
/// Returns without doing anything when `ctx.spaces` is set or when the buffer
/// is still shorter than the threshold.
///
/// # Errors
///
/// Propagates any error returned by `write_all` on the underlying writer.
fn maybe_flush_unbroken_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please add some comments to explain what these functions are doing

// In streaming mode without `-s`, avoid unbounded buffering by periodically
// flushing long unbroken output segments. When `-s` is enabled we must keep
// the buffer to preserve the last whitespace boundary for folding.
if ctx.spaces || ctx.output.len() < STREAMING_FLUSH_THRESHOLD {
return Ok(());
}

// The buffer length has reached the (non-zero) threshold at this point, so this
// emptiness guard is purely defensive.
if !ctx.output.is_empty() {
ctx.writer.write_all(ctx.output)?;
ctx.output.clear();
}
Ok(())
}

fn push_byte<W: Write>(ctx: &mut FoldContext<'_, W>, byte: u8) -> UResult<()> {
// Append a single byte and flush if the buffer grows too large.
ctx.output.push(byte);
maybe_flush_unbroken_output(ctx)
}

/// Appends `bytes` to the pending output buffer and then runs the streaming
/// flush check via `maybe_flush_unbroken_output`.
///
/// An empty slice is a no-op: nothing is appended and no flush check runs.
///
/// # Errors
///
/// Returns any error reported by `maybe_flush_unbroken_output`.
fn push_bytes<W: Write>(ctx: &mut FoldContext<'_, W>, bytes: &[u8]) -> UResult<()> {
    if bytes.is_empty() {
        Ok(())
    } else {
        ctx.output.extend_from_slice(bytes);
        maybe_flush_unbroken_output(ctx)
    }
}

fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> {
let mut idx = 0;
let len = line.len();
Expand All @@ -331,15 +365,15 @@ fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UR
NL => {
*ctx.last_space = None;
emit_output(ctx)?;
break;
idx += 1;
}
CR => {
ctx.output.push(CR);
push_byte(ctx, CR)?;
*ctx.col_count = 0;
idx += 1;
}
0x08 => {
ctx.output.push(0x08);
push_byte(ctx, 0x08)?;
*ctx.col_count = ctx.col_count.saturating_sub(1);
idx += 1;
}
Expand All @@ -358,16 +392,23 @@ fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UR
} else {
*ctx.last_space = None;
}
ctx.output.push(TAB);
push_byte(ctx, TAB)?;
idx += 1;
}
0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => {
ctx.output.push(line[idx]);
push_byte(ctx, line[idx])?;
if ctx.spaces && line[idx].is_ascii_whitespace() && line[idx] != CR {
*ctx.last_space = Some(ctx.output.len() - 1);
} else if !ctx.spaces {
*ctx.last_space = None;
}

if ctx.mode == WidthMode::Characters {
*ctx.col_count = ctx.col_count.saturating_add(1);
if *ctx.col_count >= ctx.width {
emit_output(ctx)?;
}
}
idx += 1;
}
_ => {
Expand Down Expand Up @@ -405,7 +446,7 @@ fn push_ascii_segment<W: Write>(segment: &[u8], ctx: &mut FoldContext<'_, W>) ->
let take = remaining.len().min(available);
let base_len = ctx.output.len();

ctx.output.extend_from_slice(&remaining[..take]);
push_bytes(ctx, &remaining[..take])?;
*ctx.col_count += take;

if ctx.spaces {
Expand All @@ -430,16 +471,26 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes
return process_ascii_line(line.as_bytes(), ctx);
}

process_utf8_chars(line, ctx)
}

fn process_utf8_chars<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> UResult<()> {
let line_bytes = line.as_bytes();
let mut iter = line.char_indices().peekable();

while let Some((byte_idx, ch)) = iter.next() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe move that into a function

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fix

// Include combining characters with the base character
while let Some(&(_, next_ch)) = iter.peek() {
if unicode_width::UnicodeWidthChar::width(next_ch).unwrap_or(1) == 0 {
iter.next();
} else {
break;
// Include combining characters with the base character when we are
// measuring by display columns. In character-counting mode every
// scalar value must advance the counter to match `chars().count()`
// semantics (see `fold_characters_reference` in the tests), so we do
// not coalesce zero-width scalars there.
if ctx.mode == WidthMode::Columns {
while let Some(&(_, next_ch)) = iter.peek() {
if unicode_width::UnicodeWidthChar::width(next_ch).unwrap_or(1) == 0 {
iter.next();
} else {
break;
}
}
}

Expand All @@ -448,23 +499,21 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes
if ch == '\n' {
*ctx.last_space = None;
emit_output(ctx)?;
break;
continue;
}

if *ctx.col_count >= ctx.width {
emit_output(ctx)?;
}

if ch == '\r' {
ctx.output
.extend_from_slice(&line_bytes[byte_idx..next_idx]);
push_bytes(ctx, &line_bytes[byte_idx..next_idx])?;
*ctx.col_count = 0;
continue;
}

if ch == '\x08' {
ctx.output
.extend_from_slice(&line_bytes[byte_idx..next_idx]);
push_bytes(ctx, &line_bytes[byte_idx..next_idx])?;
*ctx.col_count = ctx.col_count.saturating_sub(1);
continue;
}
Expand All @@ -484,8 +533,7 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes
} else {
*ctx.last_space = None;
}
ctx.output
.extend_from_slice(&line_bytes[byte_idx..next_idx]);
push_bytes(ctx, &line_bytes[byte_idx..next_idx])?;
continue;
}

Expand All @@ -506,8 +554,7 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes
*ctx.last_space = Some(ctx.output.len());
}

ctx.output
.extend_from_slice(&line_bytes[byte_idx..next_idx]);
push_bytes(ctx, &line_bytes[byte_idx..next_idx])?;
*ctx.col_count = ctx.col_count.saturating_add(added);
}

Expand All @@ -519,7 +566,7 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) ->
if byte == NL {
*ctx.last_space = None;
emit_output(ctx)?;
break;
continue;
}

if *ctx.col_count >= ctx.width {
Expand All @@ -539,7 +586,7 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) ->
} else {
None
};
ctx.output.push(byte);
push_byte(ctx, byte)?;
continue;
}
0x08 => *ctx.col_count = ctx.col_count.saturating_sub(1),
Expand All @@ -550,7 +597,46 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) ->
_ => *ctx.col_count = ctx.col_count.saturating_add(1),
}

ctx.output.push(byte);
push_byte(ctx, byte)?;
}

Ok(())
}

/// Process buffered bytes, emitting output for valid UTF-8 prefixes and
/// deferring incomplete sequences until more input arrives.
///
/// If the buffer contains invalid UTF-8, it is handled in non-UTF-8 mode and
/// the buffer is fully consumed.
fn process_pending_chunk<W: Write>(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please document this function

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add

pending: &mut Vec<u8>,
ctx: &mut FoldContext<'_, W>,
) -> UResult<()> {
while !pending.is_empty() {
match std::str::from_utf8(pending) {
Ok(valid) => {
process_utf8_line(valid, ctx)?;
pending.clear();
break;
}
Err(err) => {
if err.error_len().is_some() {
let res = process_non_utf8_line(pending, ctx);
pending.clear();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this won't be executed if the previous line fails, no ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fix

res?;
break;
}

let valid_up_to = err.valid_up_to();
if valid_up_to == 0 {
break;
}

let valid = std::str::from_utf8(&pending[..valid_up_to]).expect("valid prefix");
process_utf8_line(valid, ctx)?;
pending.drain(..valid_up_to);
}
}
}

Ok(())
Expand All @@ -572,20 +658,12 @@ fn fold_file<T: Read, W: Write>(
mode: WidthMode,
writer: &mut W,
) -> UResult<()> {
let mut line = Vec::new();
let mut output = Vec::new();
let mut col_count = 0;
let mut last_space = None;
let mut pending = Vec::new();

loop {
if file
.read_until(NL, &mut line)
.map_err_context(|| translate!("fold-error-readline"))?
== 0
{
break;
}

{
let mut ctx = FoldContext {
spaces,
width,
Expand All @@ -596,17 +674,32 @@ fn fold_file<T: Read, W: Write>(
last_space: &mut last_space,
};

match std::str::from_utf8(&line) {
Ok(s) => process_utf8_line(s, &mut ctx)?,
Err(_) => process_non_utf8_line(&line, &mut ctx)?,
loop {
let buffer = file
.fill_buf()
.map_err_context(|| translate!("fold-error-readline"))?;
if buffer.is_empty() {
break;
}
pending.extend_from_slice(buffer);
let consumed = buffer.len();
file.consume(consumed);

process_pending_chunk(&mut pending, &mut ctx)?;
}

line.clear();
}
if !pending.is_empty() {
match std::str::from_utf8(&pending) {
Ok(s) => process_utf8_line(s, &mut ctx)?,
Err(_) => process_non_utf8_line(&pending, &mut ctx)?,
}
pending.clear();
}

if !output.is_empty() {
writer.write_all(&output)?;
output.clear();
if !ctx.output.is_empty() {
ctx.writer.write_all(ctx.output)?;
ctx.output.clear();
}
}

Ok(())
Expand Down
Loading
Loading