-
-
Notifications
You must be signed in to change notification settings - Fork 1.7k
fold: fix GNU test fold-zero-width.sh #9274
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
6d721e5
b21cf35
77a1c31
b0e0033
ac09d10
dba5b9b
556804c
5424072
a1bbd40
d176e65
dbad8db
cd6f536
75c542f
662a011
3b07c8e
1d35753
eb7d8c4
b9b46be
5e1d7f5
e54a74b
5de4b28
0605e24
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,8 @@ const TAB_WIDTH: usize = 8; | |
| const NL: u8 = b'\n'; | ||
| const CR: u8 = b'\r'; | ||
| const TAB: u8 = b'\t'; | ||
| // Implementation threshold (8 KiB) to prevent unbounded buffer growth during streaming. | ||
| const STREAMING_FLUSH_THRESHOLD: usize = 8 * 1024; | ||
|
|
||
| mod options { | ||
| pub const BYTES: &str = "bytes"; | ||
|
|
@@ -288,6 +290,8 @@ fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize { | |
| } | ||
|
|
||
| fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> { | ||
| // Emit a folded line and keep the remaining buffer (if any) for the next line. | ||
| // When `-s` is active, we prefer breaking at the last recorded whitespace. | ||
| let consume = match *ctx.last_space { | ||
| Some(index) => index + 1, | ||
| None => ctx.output.len(), | ||
|
|
@@ -322,6 +326,36 @@ fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> { | |
| Ok(()) | ||
| } | ||
|
|
||
| fn maybe_flush_unbroken_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please add some comments to explain what these functions are doing. |
||
| // In streaming mode without `-s`, avoid unbounded buffering by periodically | ||
| // flushing long unbroken output segments. When `-s` is enabled we must keep | ||
| // the buffer to preserve the last whitespace boundary for folding. | ||
| if ctx.spaces || ctx.output.len() < STREAMING_FLUSH_THRESHOLD { | ||
| return Ok(()); | ||
| } | ||
|
|
||
| if !ctx.output.is_empty() { | ||
| ctx.writer.write_all(ctx.output)?; | ||
| ctx.output.clear(); | ||
| } | ||
| Ok(()) | ||
| } | ||
|
|
||
| fn push_byte<W: Write>(ctx: &mut FoldContext<'_, W>, byte: u8) -> UResult<()> { | ||
| // Append a single byte and flush if the buffer grows too large. | ||
| ctx.output.push(byte); | ||
| maybe_flush_unbroken_output(ctx) | ||
| } | ||
|
|
||
| fn push_bytes<W: Write>(ctx: &mut FoldContext<'_, W>, bytes: &[u8]) -> UResult<()> { | ||
| // Append a byte slice and flush if the buffer grows too large. | ||
| if bytes.is_empty() { | ||
| return Ok(()); | ||
| } | ||
| ctx.output.extend_from_slice(bytes); | ||
| maybe_flush_unbroken_output(ctx) | ||
| } | ||
|
|
||
| fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> { | ||
| let mut idx = 0; | ||
| let len = line.len(); | ||
|
|
@@ -331,15 +365,15 @@ fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UR | |
| NL => { | ||
| *ctx.last_space = None; | ||
| emit_output(ctx)?; | ||
| break; | ||
| idx += 1; | ||
| } | ||
| CR => { | ||
| ctx.output.push(CR); | ||
| push_byte(ctx, CR)?; | ||
| *ctx.col_count = 0; | ||
| idx += 1; | ||
| } | ||
| 0x08 => { | ||
| ctx.output.push(0x08); | ||
| push_byte(ctx, 0x08)?; | ||
| *ctx.col_count = ctx.col_count.saturating_sub(1); | ||
| idx += 1; | ||
| } | ||
|
|
@@ -358,16 +392,23 @@ fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UR | |
| } else { | ||
| *ctx.last_space = None; | ||
| } | ||
| ctx.output.push(TAB); | ||
| push_byte(ctx, TAB)?; | ||
| idx += 1; | ||
| } | ||
| 0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => { | ||
| ctx.output.push(line[idx]); | ||
| push_byte(ctx, line[idx])?; | ||
| if ctx.spaces && line[idx].is_ascii_whitespace() && line[idx] != CR { | ||
| *ctx.last_space = Some(ctx.output.len() - 1); | ||
| } else if !ctx.spaces { | ||
| *ctx.last_space = None; | ||
| } | ||
|
|
||
| if ctx.mode == WidthMode::Characters { | ||
| *ctx.col_count = ctx.col_count.saturating_add(1); | ||
| if *ctx.col_count >= ctx.width { | ||
| emit_output(ctx)?; | ||
| } | ||
| } | ||
| idx += 1; | ||
| } | ||
| _ => { | ||
|
|
@@ -405,7 +446,7 @@ fn push_ascii_segment<W: Write>(segment: &[u8], ctx: &mut FoldContext<'_, W>) -> | |
| let take = remaining.len().min(available); | ||
| let base_len = ctx.output.len(); | ||
|
|
||
| ctx.output.extend_from_slice(&remaining[..take]); | ||
| push_bytes(ctx, &remaining[..take])?; | ||
| *ctx.col_count += take; | ||
|
|
||
| if ctx.spaces { | ||
|
|
@@ -430,16 +471,26 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes | |
| return process_ascii_line(line.as_bytes(), ctx); | ||
| } | ||
|
|
||
| process_utf8_chars(line, ctx) | ||
| } | ||
|
|
||
| fn process_utf8_chars<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> UResult<()> { | ||
| let line_bytes = line.as_bytes(); | ||
| let mut iter = line.char_indices().peekable(); | ||
|
|
||
| while let Some((byte_idx, ch)) = iter.next() { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe move that into a function.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
| // Include combining characters with the base character | ||
| while let Some(&(_, next_ch)) = iter.peek() { | ||
| if unicode_width::UnicodeWidthChar::width(next_ch).unwrap_or(1) == 0 { | ||
| iter.next(); | ||
| } else { | ||
| break; | ||
| // Include combining characters with the base character when we are | ||
| // measuring by display columns. In character-counting mode every | ||
| // scalar value must advance the counter to match `chars().count()` | ||
| // semantics (see `fold_characters_reference` in the tests), so we do | ||
| // not coalesce zero-width scalars there. | ||
| if ctx.mode == WidthMode::Columns { | ||
| while let Some(&(_, next_ch)) = iter.peek() { | ||
| if unicode_width::UnicodeWidthChar::width(next_ch).unwrap_or(1) == 0 { | ||
| iter.next(); | ||
| } else { | ||
| break; | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -448,23 +499,21 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes | |
| if ch == '\n' { | ||
| *ctx.last_space = None; | ||
| emit_output(ctx)?; | ||
| break; | ||
| continue; | ||
| } | ||
|
|
||
| if *ctx.col_count >= ctx.width { | ||
| emit_output(ctx)?; | ||
| } | ||
|
|
||
| if ch == '\r' { | ||
| ctx.output | ||
| .extend_from_slice(&line_bytes[byte_idx..next_idx]); | ||
| push_bytes(ctx, &line_bytes[byte_idx..next_idx])?; | ||
| *ctx.col_count = 0; | ||
| continue; | ||
| } | ||
|
|
||
| if ch == '\x08' { | ||
| ctx.output | ||
| .extend_from_slice(&line_bytes[byte_idx..next_idx]); | ||
| push_bytes(ctx, &line_bytes[byte_idx..next_idx])?; | ||
| *ctx.col_count = ctx.col_count.saturating_sub(1); | ||
| continue; | ||
| } | ||
|
|
@@ -484,8 +533,7 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes | |
| } else { | ||
| *ctx.last_space = None; | ||
| } | ||
| ctx.output | ||
| .extend_from_slice(&line_bytes[byte_idx..next_idx]); | ||
| push_bytes(ctx, &line_bytes[byte_idx..next_idx])?; | ||
| continue; | ||
| } | ||
|
|
||
|
|
@@ -506,8 +554,7 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes | |
| *ctx.last_space = Some(ctx.output.len()); | ||
| } | ||
|
|
||
| ctx.output | ||
| .extend_from_slice(&line_bytes[byte_idx..next_idx]); | ||
| push_bytes(ctx, &line_bytes[byte_idx..next_idx])?; | ||
| *ctx.col_count = ctx.col_count.saturating_add(added); | ||
| } | ||
|
|
||
|
|
@@ -519,7 +566,7 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> | |
| if byte == NL { | ||
| *ctx.last_space = None; | ||
| emit_output(ctx)?; | ||
| break; | ||
| continue; | ||
| } | ||
|
|
||
| if *ctx.col_count >= ctx.width { | ||
|
|
@@ -539,7 +586,7 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> | |
| } else { | ||
| None | ||
| }; | ||
| ctx.output.push(byte); | ||
| push_byte(ctx, byte)?; | ||
| continue; | ||
| } | ||
| 0x08 => *ctx.col_count = ctx.col_count.saturating_sub(1), | ||
|
|
@@ -550,7 +597,46 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> | |
| _ => *ctx.col_count = ctx.col_count.saturating_add(1), | ||
| } | ||
|
|
||
| ctx.output.push(byte); | ||
| push_byte(ctx, byte)?; | ||
| } | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| /// Process buffered bytes, emitting output for valid UTF-8 prefixes and | ||
| /// deferring incomplete sequences until more input arrives. | ||
| /// | ||
| /// If the buffer contains invalid UTF-8, it is handled in non-UTF-8 mode and | ||
| /// the buffer is fully consumed. | ||
| fn process_pending_chunk<W: Write>( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please document this function.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added. |
||
| pending: &mut Vec<u8>, | ||
| ctx: &mut FoldContext<'_, W>, | ||
| ) -> UResult<()> { | ||
| while !pending.is_empty() { | ||
| match std::str::from_utf8(pending) { | ||
| Ok(valid) => { | ||
| process_utf8_line(valid, ctx)?; | ||
| pending.clear(); | ||
| break; | ||
| } | ||
| Err(err) => { | ||
| if err.error_len().is_some() { | ||
| let res = process_non_utf8_line(pending, ctx); | ||
| pending.clear(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This won't be executed if the previous line fails, will it?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
| res?; | ||
| break; | ||
| } | ||
|
|
||
| let valid_up_to = err.valid_up_to(); | ||
| if valid_up_to == 0 { | ||
| break; | ||
| } | ||
|
|
||
| let valid = std::str::from_utf8(&pending[..valid_up_to]).expect("valid prefix"); | ||
| process_utf8_line(valid, ctx)?; | ||
| pending.drain(..valid_up_to); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| Ok(()) | ||
|
|
@@ -572,20 +658,12 @@ fn fold_file<T: Read, W: Write>( | |
| mode: WidthMode, | ||
| writer: &mut W, | ||
| ) -> UResult<()> { | ||
| let mut line = Vec::new(); | ||
| let mut output = Vec::new(); | ||
| let mut col_count = 0; | ||
| let mut last_space = None; | ||
| let mut pending = Vec::new(); | ||
|
|
||
| loop { | ||
| if file | ||
| .read_until(NL, &mut line) | ||
| .map_err_context(|| translate!("fold-error-readline"))? | ||
| == 0 | ||
| { | ||
| break; | ||
| } | ||
|
|
||
| { | ||
| let mut ctx = FoldContext { | ||
| spaces, | ||
| width, | ||
|
|
@@ -596,17 +674,32 @@ fn fold_file<T: Read, W: Write>( | |
| last_space: &mut last_space, | ||
| }; | ||
|
|
||
| match std::str::from_utf8(&line) { | ||
| Ok(s) => process_utf8_line(s, &mut ctx)?, | ||
| Err(_) => process_non_utf8_line(&line, &mut ctx)?, | ||
| loop { | ||
| let buffer = file | ||
| .fill_buf() | ||
| .map_err_context(|| translate!("fold-error-readline"))?; | ||
| if buffer.is_empty() { | ||
| break; | ||
| } | ||
| pending.extend_from_slice(buffer); | ||
| let consumed = buffer.len(); | ||
| file.consume(consumed); | ||
|
|
||
| process_pending_chunk(&mut pending, &mut ctx)?; | ||
| } | ||
|
|
||
| line.clear(); | ||
| } | ||
| if !pending.is_empty() { | ||
| match std::str::from_utf8(&pending) { | ||
| Ok(s) => process_utf8_line(s, &mut ctx)?, | ||
| Err(_) => process_non_utf8_line(&pending, &mut ctx)?, | ||
| } | ||
| pending.clear(); | ||
| } | ||
|
|
||
| if !output.is_empty() { | ||
| writer.write_all(&output)?; | ||
| output.clear(); | ||
| if !ctx.output.is_empty() { | ||
| ctx.writer.write_all(ctx.output)?; | ||
| ctx.output.clear(); | ||
| } | ||
| } | ||
|
|
||
| Ok(()) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please document this magic number.