Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
604 changes: 326 additions & 278 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions deny.toml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ skip = [
{ name = "itertools", version = "0.13.0" },
# ordered-multimap
{ name = "hashbrown", version = "0.14.5" },
# lru (via num-prime)
{ name = "hashbrown", version = "0.15.5" },
# cexpr (via bindgen)
{ name = "nom", version = "7.1.3" },
# const-random-macro, rand_core
Expand Down
4 changes: 3 additions & 1 deletion src/uu/sort/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ bigdecimal = { workspace = true }
binary-heap-plus = { workspace = true }
clap = { workspace = true }
compare = { workspace = true }
ctrlc = { workspace = true }
fnv = { workspace = true }
itertools = { workspace = true }
memchr = { workspace = true }
Expand All @@ -42,6 +41,9 @@ uucore = { workspace = true, features = [
] }
fluent = { workspace = true }

[target.'cfg(not(target_os = "redox"))'.dependencies]
ctrlc = { workspace = true }

[target.'cfg(unix)'.dependencies]
nix = { workspace = true, features = ["resource"] }

Expand Down
1 change: 1 addition & 0 deletions src/uu/sort/locales/en-US.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ sort-help-numeric = compare according to string numerical value
sort-help-general-numeric = compare according to string general numerical value
sort-help-version-sort = Sort by SemVer version number, eg 1.12.2 > 1.1.2
sort-help-random = shuffle in random order
sort-help-random-source = use FILE as a source of random data
sort-help-dictionary-order = consider only blanks and alphanumeric characters
sort-help-merge = merge already sorted files; do not sort
sort-help-check = check for sorted input; do not sort
Expand Down
1 change: 1 addition & 0 deletions src/uu/sort/locales/fr-FR.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ sort-help-numeric = compare selon la valeur numérique de la chaîne
sort-help-general-numeric = compare selon la valeur numérique générale de la chaîne
sort-help-version-sort = Trie par numéro de version SemVer, par ex. 1.12.2 > 1.1.2
sort-help-random = mélange dans un ordre aléatoire
sort-help-random-source = utilise FICHIER comme source de données aléatoires
sort-help-dictionary-order = considère seulement les espaces et les caractères alphanumériques
sort-help-merge = fusionne les fichiers déjà triés ; ne trie pas
sort-help-check = vérifie l'entrée triée ; ne trie pas
Expand Down
96 changes: 87 additions & 9 deletions src/uu/sort/src/chunks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,26 @@

//! Utilities for reading files as chunks.

// spell-checker:ignore ELEMS
#![allow(dead_code)]
// Ignores non-used warning for `borrow_buffer` in `Chunk`

use std::{
io::{ErrorKind, Read},
ops::Range,
sync::mpsc::SyncSender,
};

use memchr::memchr_iter;
use self_cell::self_cell;
use uucore::error::{UResult, USimpleError};

use crate::{GeneralBigDecimalParseResult, GlobalSettings, Line, numeric_str_cmp::NumInfo};
use crate::{
GeneralBigDecimalParseResult, GlobalSettings, Line, SortMode, numeric_str_cmp::NumInfo,
};

/// Upper bound, in bytes (4 MiB), on the token buffer capacity that is retained after parsing a chunk.
const MAX_TOKEN_BUFFER_BYTES: usize = 4 * 1024 * 1024;
/// The same bound expressed as a number of `Range<usize>` elements; the line parser shrinks the
/// token buffer back to this capacity whenever it has grown beyond it.
const MAX_TOKEN_BUFFER_ELEMS: usize = MAX_TOKEN_BUFFER_BYTES / std::mem::size_of::<Range<usize>>();

self_cell!(
/// The chunk that is passed around between threads.
Expand All @@ -35,6 +42,8 @@ self_cell!(
/// Parsed contents of a single chunk, assembled by `read` once the raw bytes have been split into lines.
pub struct ChunkContents<'a> {
/// One `Line` per separator-delimited record found in the chunk's buffer.
pub lines: Vec<Line<'a>>,
/// Per-line auxiliary data (selections, numeric infos, parsed floats) filled in while the lines are created.
pub line_data: LineData<'a>,
/// Scratch buffer handed to `Line::create`; kept here so its allocation is carried over when the chunk is recycled.
pub token_buffer: Vec<Range<usize>>,
/// Number of lines counted in this chunk; serves as a capacity estimate when the next chunk is parsed.
pub line_count_hint: usize,
}

#[derive(Debug)]
Expand All @@ -54,6 +63,7 @@ impl Chunk {
contents.line_data.num_infos.clear();
contents.line_data.parsed_floats.clear();
contents.line_data.line_num_floats.clear();
contents.token_buffer.clear();
let lines = unsafe {
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
// because the vector is empty.
Expand All @@ -76,6 +86,8 @@ impl Chunk {
std::mem::take(&mut contents.line_data.num_infos),
std::mem::take(&mut contents.line_data.parsed_floats),
std::mem::take(&mut contents.line_data.line_num_floats),
std::mem::take(&mut contents.token_buffer),
contents.line_count_hint,
)
});
RecycledChunk {
Expand All @@ -84,6 +96,8 @@ impl Chunk {
num_infos: recycled_contents.2,
parsed_floats: recycled_contents.3,
line_num_floats: recycled_contents.4,
token_buffer: recycled_contents.5,
line_count_hint: recycled_contents.6,
buffer: self.into_owner(),
}
}
Expand All @@ -103,6 +117,8 @@ pub struct RecycledChunk {
num_infos: Vec<NumInfo>,
parsed_floats: Vec<GeneralBigDecimalParseResult>,
line_num_floats: Vec<Option<f64>>,
token_buffer: Vec<Range<usize>>,
line_count_hint: usize,
buffer: Vec<u8>,
}

Expand All @@ -114,6 +130,8 @@ impl RecycledChunk {
num_infos: Vec::new(),
parsed_floats: Vec::new(),
line_num_floats: Vec::new(),
token_buffer: Vec::new(),
line_count_hint: 0,
buffer: vec![0; capacity],
}
}
Expand Down Expand Up @@ -157,6 +175,8 @@ pub fn read<T: Read>(
num_infos,
parsed_floats,
line_num_floats,
mut token_buffer,
mut line_count_hint,
mut buffer,
} = recycled_chunk;
if buffer.len() < carry_over.len() {
Expand Down Expand Up @@ -193,8 +213,21 @@ pub fn read<T: Read>(
parsed_floats,
line_num_floats,
};
parse_lines(read, &mut lines, &mut line_data, separator, settings);
Ok(ChunkContents { lines, line_data })
parse_lines(
read,
&mut lines,
&mut line_data,
&mut token_buffer,
&mut line_count_hint,
separator,
settings,
);
Ok(ChunkContents {
lines,
line_data,
token_buffer,
line_count_hint,
})
});
sender.send(payload?).unwrap();
}
Expand All @@ -206,6 +239,8 @@ fn parse_lines<'a>(
read: &'a [u8],
lines: &mut Vec<Line<'a>>,
line_data: &mut LineData<'a>,
token_buffer: &mut Vec<Range<usize>>,
line_count_hint: &mut usize,
separator: u8,
settings: &GlobalSettings,
) {
Expand All @@ -216,12 +251,55 @@ fn parse_lines<'a>(
assert!(line_data.num_infos.is_empty());
assert!(line_data.parsed_floats.is_empty());
assert!(line_data.line_num_floats.is_empty());
let mut token_buffer = vec![];
lines.extend(
read.split(|&c| c == separator)
.enumerate()
.map(|(index, line)| Line::create(line, index, line_data, &mut token_buffer, settings)),
);
token_buffer.clear();
if token_buffer.capacity() > MAX_TOKEN_BUFFER_ELEMS {
token_buffer.shrink_to(MAX_TOKEN_BUFFER_ELEMS);
}
const SMALL_CHUNK_BYTES: usize = 64 * 1024;
let mut estimated = (*line_count_hint).max(1);
let mut exact_line_count = None;
if *line_count_hint == 0 || read.len() <= SMALL_CHUNK_BYTES {
let count = if read.is_empty() {
1
} else {
memchr_iter(separator, read).count() + 1
};
exact_line_count = Some(count);
estimated = count;
} else if estimated == 1 {
const LINE_LEN_HINT: usize = 32;
estimated = (read.len() / LINE_LEN_HINT).max(1);
}
lines.reserve(estimated);
if settings.precomputed.selections_per_line > 0 {
line_data
.selections
.reserve(estimated.saturating_mul(settings.precomputed.selections_per_line));
}
if settings.precomputed.num_infos_per_line > 0 {
line_data
.num_infos
.reserve(estimated.saturating_mul(settings.precomputed.num_infos_per_line));
}
if settings.precomputed.floats_per_line > 0 {
line_data
.parsed_floats
.reserve(estimated.saturating_mul(settings.precomputed.floats_per_line));
}
if settings.mode == SortMode::Numeric {
line_data.line_num_floats.reserve(estimated);
}
let mut start = 0usize;
let mut index = 0usize;
for sep_idx in memchr_iter(separator, read) {
let line = &read[start..sep_idx];
lines.push(Line::create(line, index, line_data, token_buffer, settings));
index += 1;
start = sep_idx + 1;
}
let line = &read[start..];
lines.push(Line::create(line, index, line_data, token_buffer, settings));
*line_count_hint = exact_line_count.unwrap_or(index + 1);
}

/// Read from `file` into `buffer`.
Expand Down
15 changes: 10 additions & 5 deletions src/uu/sort/src/merge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use uucore::error::{FromIo, UResult};
use crate::{
GlobalSettings, Output, SortError,
chunks::{self, Chunk, RecycledChunk},
compare_by, fd_soft_limit, open,
compare_by, current_open_fd_count, fd_soft_limit, open,
tmp_dir::TmpDirWrapper,
};

Expand Down Expand Up @@ -66,14 +66,19 @@ fn replace_output_file_in_input_files(
/// file-descriptor soft limit after reserving stdio/output and a safety margin.
fn effective_merge_batch_size(settings: &GlobalSettings) -> usize {
const MIN_BATCH_SIZE: usize = 2;
const RESERVED_STDIO: usize = 3;
const RESERVED_OUTPUT: usize = 1;
const RESERVED_TMP_OUTPUT: usize = 1;
const RESERVED_CTRL_C: usize = 2;
const RESERVED_RANDOM_SOURCE: usize = 1;
const SAFETY_MARGIN: usize = 1;
let mut batch_size = settings.merge_batch_size.max(MIN_BATCH_SIZE);

if let Some(limit) = fd_soft_limit() {
let reserved = RESERVED_STDIO + RESERVED_OUTPUT + SAFETY_MARGIN;
let available_inputs = limit.saturating_sub(reserved);
let open_fds = current_open_fd_count().unwrap_or(3);
let mut reserved = RESERVED_TMP_OUTPUT + RESERVED_CTRL_C + SAFETY_MARGIN;
if settings.salt.is_some() {
reserved = reserved.saturating_add(RESERVED_RANDOM_SOURCE);
}
let available_inputs = limit.saturating_sub(open_fds.saturating_add(reserved));
if available_inputs >= MIN_BATCH_SIZE {
batch_size = batch_size.min(available_inputs);
} else {
Expand Down
Loading
Loading