Add Floyd-Rivest selection and parallelization to gmedian #7481
+122
−22
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.

Not directly requested, but this improves the benchmarks.
Details
```r
library(atime)
library(data.table)

# Path to the package source tree; passed as the first argument of
# atime_versions() in every benchmark of this script.
pkg.path = '.'
# Passed as seconds.limit to every atime_versions() call in this script.
limit = 5
# taken from .ci/atime/tests.R
#
# Rewrite a copy of the package sources so that it builds under a new,
# version-specific package name.
#
# old.Package   package name as it currently appears in the sources.
# new.Package   package name to write into DESCRIPTION and packageVersion().
# sha           version identifier appended (after "_") to the underscore
#               form of the name used in C symbols and useDynLib.
# new.pkg.path  root directory of the package copy; every glob below is
#               resolved relative to it.
#
# Called for its side effects on the files under new.pkg.path.
# NOTE(review): the calling convention is presumably dictated by the
# pkg.edit.fun argument of atime::atime_versions() -- confirm against atime.
pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) {
  pkg_find_replace = function(glob, FIND, REPLACE) {
    atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
  }
  # Regex matching the name with or without "_" where the name has a ".".
  Package_regex = gsub(".", "_?", old.Package, fixed = TRUE)
  # Underscore form of the name, and its version-specific counterpart.
  Package_ = gsub(".", "_", old.Package, fixed = TRUE)
  new.Package_ = paste0(Package_, "_", sha)
  pkg_find_replace(
    "DESCRIPTION",
    paste0("Package:\\s+", old.Package),
    paste("Package:", new.Package))
  pkg_find_replace(
    file.path("src", "Makevars.*in"),
    Package_regex,
    new.Package_)
  pkg_find_replace(
    file.path("R", "onLoad.R"),
    Package_regex,
    new.Package_)
  pkg_find_replace(
    file.path("R", "onLoad.R"),
    sprintf('packageVersion\\("%s"\\)', old.Package),
    sprintf('packageVersion\\("%s"\\)', new.Package))
  pkg_find_replace(
    file.path("src", "init.c"),
    paste0("R_init_", Package_regex),
    paste0("R_init_", gsub("[.]", "_", new.Package_)))
  pkg_find_replace(
    "NAMESPACE",
    sprintf('useDynLib\\("?%s"?', Package_regex),
    paste0('useDynLib(', new.Package_))
}
# Named vector of commit SHAs; the benchmarks in this script pass it to
# atime_versions() as sha.vec.
versions = c(before = "b0c4ac3b", after = "df05b9d0")
# Seed the RNG so the simulated benchmark inputs are reproducible.
set.seed(123)
# ===== Benchmark 1: Small groups (10 elements each) =====
# N is the number of groups, log-spaced from 1e2 to 1e6; each table has
# N * grpsize_small rows.
N = as.integer(10^seq(2, 6, 0.25))
grpsize_small = 10
# Pre-build one table per size, named by as.character(n) so that setup can
# look it up without paying the construction cost inside the timing.
gmedian_work1 = lapply(setNames(nm = N), function(n) {
  data.table(
    grp = rep(1:n, each = grpsize_small),
    x = rnorm(n * grpsize_small)
  )
})
gmedian_bench1 = atime_versions(
  pkg.path, N,
  setup = {
    dt = gmedian_work1[[as.character(N)]]
  },
  # `[.data.table` is not a syntactic name, so it must be backtick-quoted
  # after `:::`.
  expr = data.table:::`[.data.table`(dt, , .(med = median(x)), by = grp),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench1)
# ===== Benchmark 2: Medium groups (100 elements each) =====
# N is the number of groups, log-spaced from 1e2 to 1e5; each table has
# N * grpsize_medium rows.
N = as.integer(10^seq(2, 5, 0.25))
grpsize_medium = 100
# Pre-build one table per size, named by as.character(n) for lookup in setup.
gmedian_work2 = lapply(setNames(nm = N), function(n) {
  data.table(
    grp = rep(1:n, each = grpsize_medium),
    x = rnorm(n * grpsize_medium)
  )
})
gmedian_bench2 = atime_versions(
  pkg.path, N,
  setup = {
    dt = gmedian_work2[[as.character(N)]]
  },
  # `[.data.table` is not a syntactic name, so it must be backtick-quoted
  # after `:::`.
  expr = data.table:::`[.data.table`(dt, , .(med = median(x)), by = grp),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench2)
# ===== Benchmark 3: Large groups (1000 elements each) =====
# N is the number of groups, log-spaced from 1e2 to 10^4.5; each table has
# N * grpsize_large rows.
N = as.integer(10^seq(2, 4.5, 0.25))
grpsize_large = 1000
# Pre-build one table per size, named by as.character(n) for lookup in setup.
gmedian_work3 = lapply(setNames(nm = N), function(n) {
  data.table(
    grp = rep(1:n, each = grpsize_large),
    x = rnorm(n * grpsize_large)
  )
})
gmedian_bench3 = atime_versions(
  pkg.path, N,
  setup = {
    dt = gmedian_work3[[as.character(N)]]
  },
  # `[.data.table` is not a syntactic name, so it must be backtick-quoted
  # after `:::`.
  expr = data.table:::`[.data.table`(dt, , .(med = median(x)), by = grp),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench3)
# ===== Benchmark 4: Very large groups (10000 elements each) =====
# N is the number of groups, log-spaced from 1e1 to 10^3.5; each table has
# N * grpsize_vlarge rows.
N = as.integer(10^seq(1, 3.5, 0.25))
grpsize_vlarge = 10000
# Pre-build one table per size, named by as.character(n) for lookup in setup.
gmedian_work4 = lapply(setNames(nm = N), function(n) {
  data.table(
    grp = rep(1:n, each = grpsize_vlarge),
    x = rnorm(n * grpsize_vlarge)
  )
})
gmedian_bench4 = atime_versions(
  pkg.path, N,
  setup = {
    dt = gmedian_work4[[as.character(N)]]
  },
  # `[.data.table` is not a syntactic name, so it must be backtick-quoted
  # after `:::`.
  expr = data.table:::`[.data.table`(dt, , .(med = median(x)), by = grp),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench4)
# ===== Benchmark 5: Integer data =====
# N is the number of groups, log-spaced from 1e2 to 10^4.5; each table has
# N * grpsize_int rows.
N = as.integer(10^seq(2, 4.5, 0.25))
grpsize_int = 1000
# Pre-build one table per size, named by as.character(n) for lookup in setup.
# x is an integer column drawn with replacement from 1..1000.
gmedian_work5 = lapply(setNames(nm = N), function(n) {
  data.table(
    grp = rep(1:n, each = grpsize_int),
    x = sample(1L:1000L, n * grpsize_int, replace = TRUE)
  )
})
gmedian_bench5 = atime_versions(
  pkg.path, N,
  setup = {
    dt = gmedian_work5[[as.character(N)]]
  },
  # `[.data.table` is not a syntactic name, so it must be backtick-quoted
  # after `:::`.
  expr = data.table:::`[.data.table`(dt, , .(med = median(x)), by = grp),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench5)
# ===== Benchmark 6: With NA values =====
# N is the number of groups, log-spaced from 1e2 to 10^4.5; each table has
# N * grpsize_na rows.
N = as.integer(10^seq(2, 4.5, 0.25))
grpsize_na = 1000
# Pre-build one table per size, named by as.character(n) for lookup in setup.
gmedian_work6 = lapply(setNames(nm = N), function(n) {
  dt = data.table(
    grp = rep(1:n, each = grpsize_na),
    x = rnorm(n * grpsize_na)
  )
  # Set 10% of values to NA
  dt[sample(.N, as.integer(.N * 0.1)), x := NA_real_]
  dt
})
gmedian_bench6 = atime_versions(
  pkg.path, N,
  setup = {
    dt = gmedian_work6[[as.character(N)]]
  },
  # `[.data.table` is not a syntactic name, so it must be backtick-quoted
  # after `:::`. na.rm = TRUE makes the median skip the injected NAs.
  expr = data.table:::`[.data.table`(dt, , .(med = median(x, na.rm = TRUE)), by = grp),
  seconds.limit = limit, verbose = TRUE, sha.vec = versions,
  pkg.edit.fun = pkg.edit.fun
)
plot(gmedian_bench6)
# ===== SAVE BENCHMARK RESULTS =====
# Write all six benchmark objects into a single .rda file under results_dir.
results_dir = "benchmark_results"
# showWarnings = FALSE keeps re-runs quiet when the directory already exists.
dir.create(results_dir, showWarnings = FALSE)
save(
  gmedian_bench1, gmedian_bench2, gmedian_bench3,
  gmedian_bench4, gmedian_bench5, gmedian_bench6,
  file = file.path(results_dir, "benchmarks_gmedian.rda")
)
# Render every benchmark plot into a multi-page PDF, four panels per page.
library(ggplot2)
library(patchwork)
# Reload the saved benchmark objects from results_dir.
load(file.path(results_dir, "benchmarks_gmedian.rda"))
# Attach a title to a plot object.
g = function(p, title) {
  p + ggtitle(title)
}
# Pair each benchmark's plot with its panel title, in benchmark order.
p = Map(
  g,
  lapply(
    list(
      gmedian_bench1, gmedian_bench2, gmedian_bench3,
      gmedian_bench4, gmedian_bench5, gmedian_bench6
    ),
    plot
  ),
  c(
    "Small groups (10 elements)",
    "Medium groups (100 elements)",
    "Large groups (1000 elements)",
    "Very large groups (10000 elements)",
    "Integer data (1000 elements)",
    "With 10% NA values (1000 elements)"
  )
)
pdf("benchmark_gmedian_results.pdf", width = 12, height = 9)
# Walk the panels in chunks of four; the last page may hold fewer.
for (i in seq(1, length(p), by = 4)) {
  page_idx = i:min(i + 3, length(p))
  pg = wrap_plots(p[page_idx], ncol = 2, nrow = 2, guides = "collect")
  print(pg)
}
dev.off()