Skip to content

Commit c64ac86

Browse files
Merge pull request #479 from ruby/mvh-add-p-value
Calculate and add p-value and stddev to summary table
2 parents a5998ba + bf3b7b3 commit c64ac86

File tree

9 files changed

+353
-8
lines changed

9 files changed

+353
-8
lines changed

lib/argument_parser.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ class ArgumentParser
2323
:skip_yjit,
2424
:skip_zjit,
2525
:with_pre_init,
26+
:pvalue,
2627
keyword_init: true
2728
)
2829

@@ -144,6 +145,10 @@ def parse(argv)
144145
args.rss = true
145146
end
146147

148+
opts.on("--pvalue", "show p-value and significance columns for each comparison") do
149+
args.pvalue = true
150+
end
151+
147152
opts.on("--graph", "generate a graph image of benchmark results") do
148153
args.graph = true
149154
end
@@ -224,6 +229,7 @@ def default_args
224229
name_filters: [],
225230
excludes: [],
226231
rss: false,
232+
pvalue: false,
227233
graph: false,
228234
no_pinning: false,
229235
force_pinning: false,

lib/benchmark_runner.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ def build_output_text(ruby_descriptions, table, format, bench_failures)
6666
output_str << "- #{name} 1st itr: ratio of #{base_name}/#{name} time for the first benchmarking iteration.\n"
6767
output_str << "- #{base_name}/#{name}: ratio of #{base_name}/#{name} time. Higher is better for #{name}. Above 1 represents a speedup.\n"
6868
end
69+
output_str << "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)\n"
6970
end
7071

7172
output_str

lib/benchmark_runner/cli.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ def run
6969
builder = ResultsTableBuilder.new(
7070
executable_names: ruby_descriptions.keys,
7171
bench_data: bench_data,
72-
include_rss: args.rss
72+
include_rss: args.rss,
73+
include_pvalue: args.pvalue
7374
)
7475
table, format = builder.build
7576

lib/results_table_builder.rb

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@ class ResultsTableBuilder
55
SECONDS_TO_MS = 1000.0
66
BYTES_TO_MIB = 1024.0 * 1024.0
77

8-
def initialize(executable_names:, bench_data:, include_rss: false)
8+
def initialize(executable_names:, bench_data:, include_rss: false, include_pvalue: false)
99
@executable_names = executable_names
1010
@bench_data = bench_data
1111
@include_rss = include_rss
12+
@include_pvalue = include_pvalue
1213
@base_name = executable_names.first
1314
@other_names = executable_names[1..]
1415
@bench_names = compute_bench_names
@@ -48,6 +49,9 @@ def build_header
4849

4950
@other_names.each do |name|
5051
header << "#{@base_name}/#{name}"
52+
if @include_pvalue
53+
header << "p-value" << "sig"
54+
end
5155
end
5256

5357
header
@@ -66,7 +70,10 @@ def build_format
6670
end
6771

6872
@other_names.each do |_name|
69-
format << "%.3f"
73+
format << "%s"
74+
if @include_pvalue
75+
format << "%s" << "%s"
76+
end
7077
end
7178

7279
format
@@ -105,9 +112,60 @@ def build_comparison_columns(row, other_ts, other_rsss)
105112

106113
# Appends the ratio cells for one benchmark row.
#
# The first-iteration ratios (base_t0 / other_t0) are appended for every
# other executable first. Then, per executable, comes the mean-time ratio
# (annotated by format_ratio using the Welch p-value) and, only when
# @include_pvalue is set, the formatted p-value and its significance level.
def build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts)
  first_iteration_ratios = other_t0s.map { |t0| base_t0 / t0 }
  row.concat(first_iteration_ratios)

  other_ts.each do |candidate_times|
    # The p-value is always needed: format_ratio uses it for the star marker.
    p_value = Stats.welch_p_value(base_t, candidate_times)
    speed_ratio = mean(base_t) / mean(candidate_times)
    row << format_ratio(speed_ratio, p_value)
    next unless @include_pvalue

    row << format_p_value(p_value) << significance_level(p_value)
  end
end
126+
127+
# Renders a ratio with three decimals. When significance_symbol yields a
# non-empty marker for the p-value, it is appended in parentheses,
# e.g. "1.234 (**)".
def format_ratio(ratio, pval)
  stars = significance_symbol(pval)
  text = "%.3f" % ratio
  return text if stars.empty?

  "#{text} (#{stars})"
end
132+
133+
# Formats a p-value for display: "N/A" for nil, three fixed decimals when
# the value is at least 0.001, and one-digit scientific notation otherwise.
def format_p_value(pval)
  return "N/A" if pval.nil?

  template = pval >= 0.001 ? "%.3f" : "%.1e"
  template % pval
end
142+
143+
# Maps a p-value to its star marker: "***" below 0.001, "**" below 0.01,
# "*" below 0.05, and an empty string otherwise (including nil).
def significance_symbol(pval)
  return "" if pval.nil?

  # Ordered strictest-first so the first matching cutoff wins.
  _cutoff, stars = [[0.001, "***"], [0.01, "**"], [0.05, "*"]].find { |cutoff, _| pval < cutoff }
  stars || ""
end
156+
157+
# Maps a p-value to the textual threshold it falls under ("p < 0.001",
# "p < 0.01", "p < 0.05"), or an empty string when it is nil or >= 0.05.
def significance_level(pval)
  return "" if pval.nil?

  # Ordered strictest-first so the first matching cutoff wins.
  _cutoff, label = [[0.001, "p < 0.001"], [0.01, "p < 0.01"], [0.05, "p < 0.05"]].find do |cutoff, _|
    pval < cutoff
  end
  label || ""
end
112170

113171
def extract_first_iteration_times(bench_name)

misc/stats.rb

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,101 @@
11
class Stats
2+
class << self
  # Welch's unequal-variance t-test (two-tailed).
  #
  # @param a [Array<Numeric>] first sample
  # @param b [Array<Numeric>] second sample
  # @return [Float, nil] the p-value, or nil when either sample has fewer
  #   than two values or the statistic is undefined (NaN, e.g. because a
  #   sample contains NaN)
  def welch_p_value(a, b)
    return nil if a.size < 2 || b.size < 2

    stats_a = new(a)
    stats_b = new(b)
    mean_a = stats_a.mean
    mean_b = stats_b.mean

    n_a = a.size.to_f
    n_b = b.size.to_f

    # Each sample's contribution to the squared standard error of the
    # difference of means.
    se_sq_a = stats_a.sample_variance / n_a
    se_sq_b = stats_b.sample_variance / n_b
    se_sq = se_sq_a + se_sq_b

    if se_sq == 0.0
      # Both samples have zero variance — if means match they're
      # indistinguishable, otherwise they're trivially different.
      return mean_a == mean_b ? 1.0 : 0.0
    end

    t = (mean_a - mean_b) / Math.sqrt(se_sq)

    # Welch-Satterthwaite degrees of freedom
    df = se_sq**2 / (se_sq_a**2 / (n_a - 1) + se_sq_b**2 / (n_b - 1))

    # Two-tailed p-value: I_x(df/2, 1/2) where x = df/(df + t^2)
    x = df / (df + t * t)

    # A NaN statistic cannot be evaluated; without this guard the continued
    # fraction would run to its iteration cap, warn, and return NaN.
    return nil if x.nan?

    regularized_incomplete_beta(x, df / 2.0, 0.5)
  end

  private

  # Regularized incomplete beta function I_x(alpha, beta), evaluated with a
  # continued fraction (Lentz's method).
  #
  # @return [Float] 0.0 for x <= 0, 1.0 for x >= 1, otherwise I_x(alpha, beta)
  def regularized_incomplete_beta(x, alpha, beta)
    return 0.0 if x <= 0.0
    return 1.0 if x >= 1.0

    # Symmetry relation I_x(a, b) = 1 - I_{1-x}(b, a): pick the side that
    # converges faster.
    if x > (alpha + 1.0) / (alpha + beta + 2.0)
      return 1.0 - regularized_incomplete_beta(1.0 - x, beta, alpha)
    end

    # Prefactor x^alpha * (1-x)^beta / B(alpha, beta), assembled in log-space
    # to avoid overflow. Math.lgamma returns [value, sign]; only the value is
    # used here.
    ln_prefactor = Math.lgamma(alpha + beta)[0] - Math.lgamma(alpha)[0] - Math.lgamma(beta)[0] +
                   alpha * Math.log(x) + beta * Math.log(1.0 - x)

    Math.exp(ln_prefactor) * beta_continued_fraction(x, alpha, beta) / alpha
  end

  # Evaluates the continued fraction for I_x(alpha, beta) using Lentz's
  # algorithm. Each iteration applies two sub-steps (the even and odd
  # coefficients of the fraction). Emits a warning if the last correction
  # factor has not come within 1e-10 of 1 after 200 iterations.
  def beta_continued_fraction(x, alpha, beta)
    floor = 1.0e-30 # prevent division by zero in Lentz's method

    numerator_term = 1.0
    denominator_term = 1.0 - (alpha + beta) * x / (alpha + 1.0)
    denominator_term = floor if denominator_term.abs < floor
    denominator_term = 1.0 / denominator_term
    fraction = denominator_term

    # One Lentz sub-step: folds a coefficient into both running terms and
    # returns the multiplicative correction for the fraction.
    advance = lambda do |coeff|
      denominator_term = 1.0 + coeff * denominator_term
      denominator_term = floor if denominator_term.abs < floor
      numerator_term = 1.0 + coeff / numerator_term
      numerator_term = floor if numerator_term.abs < floor
      denominator_term = 1.0 / denominator_term
      denominator_term * numerator_term
    end

    converged = (1..200).any? do |iteration|
      two_i = 2 * iteration

      # Even sub-step: d_{2m} coefficient of the continued fraction
      even_coeff = iteration * (beta - iteration) * x / ((alpha + two_i - 1.0) * (alpha + two_i))
      fraction *= advance.call(even_coeff)

      # Odd sub-step: d_{2m+1} coefficient of the continued fraction
      odd_coeff = -(alpha + iteration) * (alpha + beta + iteration) * x / ((alpha + two_i) * (alpha + two_i + 1.0))
      correction = advance.call(odd_coeff)
      fraction *= correction

      (correction - 1.0).abs < 1.0e-10
    end

    warn "Stats.beta_continued_fraction: did not converge (alpha=#{alpha}, beta=#{beta}, x=#{x})" unless converged
    fraction
  end
end
98+
299
# Keeps a reference to the given collection of values (no copy is made);
# the other instance methods read it through @data.
def initialize(data)
3100
@data = data
4101
end
@@ -15,13 +112,20 @@ def mean
15112
@data.sum(0.0) / @data.size
16113
end
17114

115+
# Population standard deviation (N denominator) — describes these specific values.
18116
# Population standard deviation: the square root of the mean squared
# deviation from the mean, dividing by N (not N-1).
def stddev
  center = mean
  squared_total = @data.sum(0.0) do |v|
    offset = v - center
    offset * offset
  end
  Math.sqrt(squared_total / @data.size)
end
24122

123+
# Unbiased sample variance (N-1 denominator, Bessel's correction) — for inference.
124+
# Unbiased sample variance (N-1 denominator, Bessel's correction).
#
# @return [Float] the variance, or NaN when there are fewer than two values.
#   A single value already divided 0 by 0; empty data used to yield -0.0,
#   which read as "zero variance".
def sample_variance
  count = @data.size
  return Float::NAN if count < 2

  center = mean
  @data.sum { |v| (v - center)**2 } / (count - 1).to_f
end
128+
25129
# Median of the stored values; delegates to compute_median with @data.
def median
26130
compute_median(@data)
27131
end

test/argument_parser_test.rb

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def setup_mock_ruby(path)
4949
assert_equal [], args.categories
5050
assert_equal [], args.name_filters
5151
assert_equal false, args.rss
52+
assert_equal false, args.pvalue
5253
assert_equal false, args.graph
5354
assert_equal false, args.no_pinning
5455
assert_equal false, args.turbo
@@ -428,6 +429,15 @@ def setup_mock_ruby(path)
428429
end
429430
end
430431

432+
# Parsing the bare --pvalue switch must turn args.pvalue on.
describe '--pvalue option' do
433+
it 'sets pvalue flag' do
434+
parser = ArgumentParser.new
435+
args = parser.parse(['--pvalue'])
436+
437+
assert_equal true, args.pvalue
438+
end
439+
end
440+
431441
describe '--graph option' do
432442
it 'sets graph flag' do
433443
parser = ArgumentParser.new

test/benchmark_runner_test.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,7 @@
387387
assert_includes result, 'Legend:'
388388
assert_includes result, '- ruby-yjit 1st itr: ratio of ruby-base/ruby-yjit time for the first benchmarking iteration.'
389389
assert_includes result, '- ruby-base/ruby-yjit: ratio of ruby-base/ruby-yjit time. Higher is better for ruby-yjit. Above 1 represents a speedup.'
390+
assert_includes result, "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)"
390391
end
391392

392393
it 'includes formatted table in output' do

0 commit comments

Comments
 (0)