Skip to content

Commit 7bd2f2d

Browse files
Calculate and add p-value and stddev to summary table
Using Welch's t-test for p-values because I don't want to assume that both versions of Ruby tested are going to have equal timing distributions.
1 parent a5998ba commit 7bd2f2d

File tree

4 files changed

+278
-9
lines changed

4 files changed

+278
-9
lines changed

lib/results_table_builder.rb

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def build_header
4747
end
4848

4949
@other_names.each do |name|
50-
header << "#{@base_name}/#{name}"
50+
header << "#{@base_name}/#{name}" << "p-value" << "sig"
5151
end
5252

5353
header
@@ -66,7 +66,7 @@ def build_format
6666
end
6767

6868
@other_names.each do |_name|
69-
format << "%.3f"
69+
format << "%.3f" << "%s" << "%s"
7070
end
7171

7272
format
@@ -105,9 +105,38 @@ def build_comparison_columns(row, other_ts, other_rsss)
105105

106106
def build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts)
107107
ratio_1sts = other_t0s.map { |other_t0| base_t0 / other_t0 }
108-
ratios = other_ts.map { |other_t| mean(base_t) / mean(other_t) }
109108
row.concat(ratio_1sts)
110-
row.concat(ratios)
109+
110+
other_ts.each do |other_t|
111+
pval = Stats.welch_p_value(base_t, other_t)
112+
row << mean(base_t) / mean(other_t)
113+
row << format_p_value(pval)
114+
row << significance_level(pval)
115+
end
116+
end
117+
118+
def format_p_value(pval)
119+
return "N/A" if pval.nil?
120+
121+
if pval >= 0.001
122+
"%.3f" % pval
123+
else
124+
"%.1e" % pval
125+
end
126+
end
127+
128+
def significance_level(pval)
129+
return "" if pval.nil?
130+
131+
if pval < 0.001
132+
"p < 0.001"
133+
elsif pval < 0.01
134+
"p < 0.01"
135+
elsif pval < 0.05
136+
"p < 0.05"
137+
else
138+
""
139+
end
111140
end
112141

113142
def extract_first_iteration_times(bench_name)

misc/stats.rb

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,101 @@
11
class Stats
2+
class << self
3+
# Welch's t-test (two-tailed). Returns the p-value, or nil if
4+
# either sample is too small to compute a meaningful test.
5+
def welch_p_value(a, b)
6+
return nil if a.size < 2 || b.size < 2
7+
8+
stats_a = new(a)
9+
stats_b = new(b)
10+
11+
n_a = a.size.to_f
12+
n_b = b.size.to_f
13+
var_a = stats_a.sample_variance
14+
var_b = stats_b.sample_variance
15+
16+
se_sq = var_a / n_a + var_b / n_b
17+
if se_sq == 0.0
18+
# Both samples have zero variance — if means match they're
19+
# indistinguishable, otherwise they're trivially different.
20+
return stats_a.mean == stats_b.mean ? 1.0 : 0.0
21+
end
22+
23+
t = (stats_a.mean - stats_b.mean) / Math.sqrt(se_sq)
24+
25+
# Welch-Satterthwaite degrees of freedom
26+
df = se_sq ** 2 / ((var_a / n_a) ** 2 / (n_a - 1) + (var_b / n_b) ** 2 / (n_b - 1))
27+
28+
# Two-tailed p-value: I_x(df/2, 1/2) where x = df/(df + t^2)
29+
x = df / (df + t * t)
30+
regularized_incomplete_beta(x, df / 2.0, 0.5)
31+
end
32+
33+
private
34+
35+
# Regularized incomplete beta function I_x(alpha, beta) via continued fraction (Lentz's method).
36+
# Returns the probability that a Beta(alpha, beta)-distributed variable is <= x.
37+
def regularized_incomplete_beta(x, alpha, beta)
38+
return 0.0 if x <= 0.0
39+
return 1.0 if x >= 1.0
40+
41+
# Symmetry relation: pick the side that converges faster
42+
if x > (alpha + 1.0) / (alpha + beta + 2.0)
43+
return 1.0 - regularized_incomplete_beta(1.0 - x, beta, alpha)
44+
end
45+
46+
# B(alpha, beta) * x^alpha * (1-x)^beta — computed in log-space to avoid overflow
47+
ln_normalizer = Math.lgamma(alpha + beta)[0] - Math.lgamma(alpha)[0] - Math.lgamma(beta)[0] +
48+
alpha * Math.log(x) + beta * Math.log(1.0 - x)
49+
normalizer = Math.exp(ln_normalizer)
50+
51+
normalizer * beta_continued_fraction(x, alpha, beta) / alpha
52+
end
53+
54+
# Evaluates the continued fraction for I_x(alpha, beta) using Lentz's algorithm.
55+
# Each iteration computes two sub-steps (even and odd terms of the fraction).
56+
def beta_continued_fraction(x, alpha, beta)
57+
floor = 1.0e-30 # prevent division by zero in Lentz's method
58+
converged = false
59+
60+
numerator_term = 1.0
61+
denominator_term = 1.0 - (alpha + beta) * x / (alpha + 1.0)
62+
denominator_term = floor if denominator_term.abs < floor
63+
denominator_term = 1.0 / denominator_term
64+
fraction = denominator_term
65+
66+
(1..200).each do |iteration|
67+
two_i = 2 * iteration
68+
69+
# Even sub-step: d_{2m} coefficient of the continued fraction
70+
coeff = iteration * (beta - iteration) * x / ((alpha + two_i - 1.0) * (alpha + two_i))
71+
denominator_term = 1.0 + coeff * denominator_term
72+
denominator_term = floor if denominator_term.abs < floor
73+
numerator_term = 1.0 + coeff / numerator_term
74+
numerator_term = floor if numerator_term.abs < floor
75+
denominator_term = 1.0 / denominator_term
76+
fraction *= denominator_term * numerator_term
77+
78+
# Odd sub-step: d_{2m+1} coefficient of the continued fraction
79+
coeff = -(alpha + iteration) * (alpha + beta + iteration) * x / ((alpha + two_i) * (alpha + two_i + 1.0))
80+
denominator_term = 1.0 + coeff * denominator_term
81+
denominator_term = floor if denominator_term.abs < floor
82+
numerator_term = 1.0 + coeff / numerator_term
83+
numerator_term = floor if numerator_term.abs < floor
84+
denominator_term = 1.0 / denominator_term
85+
correction = denominator_term * numerator_term
86+
fraction *= correction
87+
88+
if (correction - 1.0).abs < 1.0e-10
89+
converged = true
90+
break
91+
end
92+
end
93+
94+
warn "Stats.beta_continued_fraction: did not converge (alpha=#{alpha}, beta=#{beta}, x=#{x})" unless converged
95+
fraction
96+
end
97+
end
98+
299
def initialize(data)
3100
@data = data
4101
end
@@ -15,13 +112,20 @@ def mean
15112
@data.sum(0.0) / @data.size
16113
end
17114

115+
# Population standard deviation (N denominator) — describes these specific values.
18116
def stddev
19117
mean = self.mean
20118
diffs_squared = @data.map { |v| (v-mean) * (v-mean) }
21119
mean_squared = diffs_squared.sum(0.0) / @data.size
22120
Math.sqrt(mean_squared)
23121
end
24122

123+
# Unbiased sample variance (N-1 denominator, Bessel's correction) — for inference.
124+
def sample_variance
125+
m = mean
126+
@data.sum { |v| (v - m) ** 2 } / (@data.size - 1).to_f
127+
end
128+
25129
def median
26130
compute_median(@data)
27131
end

test/results_table_builder_test.rb

Lines changed: 69 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,15 +56,17 @@
5656

5757
table, format = builder.build
5858

59-
assert_equal ['bench', 'ruby (ms)', 'stddev (%)', 'ruby-yjit (ms)', 'stddev (%)', 'ruby-yjit 1st itr', 'ruby/ruby-yjit'], table[0]
59+
assert_equal ['bench', 'ruby (ms)', 'stddev (%)', 'ruby-yjit (ms)', 'stddev (%)', 'ruby-yjit 1st itr', 'ruby/ruby-yjit', 'p-value', 'sig'], table[0]
6060

61-
assert_equal ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f'], format
61+
assert_equal ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%s', '%s'], format
6262

6363
assert_equal 'fib', table[1][0]
6464
assert_in_delta 100.0, table[1][1], 1.0
6565
assert_in_delta 50.0, table[1][3], 1.0
6666
assert_in_delta 2.0, table[1][5], 0.1
6767
assert_in_delta 2.0, table[1][6], 0.1
68+
assert_instance_of String, table[1][7]
69+
assert_instance_of String, table[1][8]
6870
end
6971

7072
it 'includes RSS columns when include_rss is true' do
@@ -171,12 +173,12 @@
171173
'ruby-rjit (ms)', 'stddev (%)',
172174
'ruby-yjit 1st itr',
173175
'ruby-rjit 1st itr',
174-
'ruby/ruby-yjit',
175-
'ruby/ruby-rjit'
176+
'ruby/ruby-yjit', 'p-value', 'sig',
177+
'ruby/ruby-rjit', 'p-value', 'sig'
176178
]
177179
assert_equal expected_header, table[0]
178180

179-
expected_format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%.3f', '%.3f']
181+
expected_format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%.3f', '%s', '%s', '%.3f', '%s', '%s']
180182
assert_equal expected_format, format
181183
end
182184

@@ -308,6 +310,68 @@
308310
assert_equal 'fib', table[1][0]
309311
end
310312

313+
it 'shows small p-value in scientific notation for clearly different distributions' do
314+
executable_names = ['ruby', 'ruby-yjit']
315+
bench_data = {
316+
'ruby' => {
317+
'fib' => {
318+
'warmup' => [0.1],
319+
'bench' => [0.100, 0.101, 0.099, 0.1005, 0.0995, 0.1002, 0.0998, 0.1001, 0.0999, 0.1003],
320+
'rss' => 1024 * 1024 * 10
321+
}
322+
},
323+
'ruby-yjit' => {
324+
'fib' => {
325+
'warmup' => [0.05],
326+
'bench' => [0.050, 0.051, 0.049, 0.0505, 0.0495, 0.0502, 0.0498, 0.0501, 0.0499, 0.0503],
327+
'rss' => 1024 * 1024 * 12
328+
}
329+
}
330+
}
331+
332+
builder = ResultsTableBuilder.new(
333+
executable_names: executable_names,
334+
bench_data: bench_data,
335+
include_rss: false
336+
)
337+
338+
table, _format = builder.build
339+
p_value_str = table[1][-2]
340+
sig_str = table[1].last
341+
assert_match(/e-/, p_value_str, "Expected scientific notation for very small p-value, got #{p_value_str}")
342+
assert_equal "p < 0.001", sig_str
343+
end
344+
345+
it 'shows N/A p-value when samples have fewer than 2 elements' do
346+
executable_names = ['ruby', 'ruby-yjit']
347+
bench_data = {
348+
'ruby' => {
349+
'fib' => {
350+
'warmup' => [0.1],
351+
'bench' => [0.1],
352+
'rss' => 1024 * 1024 * 10
353+
}
354+
},
355+
'ruby-yjit' => {
356+
'fib' => {
357+
'warmup' => [0.05],
358+
'bench' => [0.05],
359+
'rss' => 1024 * 1024 * 12
360+
}
361+
}
362+
}
363+
364+
builder = ResultsTableBuilder.new(
365+
executable_names: executable_names,
366+
bench_data: bench_data,
367+
include_rss: false
368+
)
369+
370+
table, _format = builder.build
371+
assert_equal 'N/A', table[1][-2]
372+
assert_equal '', table[1].last
373+
end
374+
311375
it 'handles only headline benchmarks' do
312376
executable_names = ['ruby']
313377
bench_data = {

test/stats_test.rb

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
require_relative 'test_helper'
2+
require_relative '../misc/stats'
3+
4+
describe Stats do
5+
describe '#sample_variance' do
6+
it 'computes unbiased sample variance (n-1 denominator)' do
7+
stats = Stats.new([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
8+
assert_in_delta 4.571, stats.sample_variance, 0.001
9+
end
10+
11+
it 'equals population variance * n/(n-1)' do
12+
data = [10.0, 12.0, 14.0, 16.0, 18.0]
13+
stats = Stats.new(data)
14+
n = data.size.to_f
15+
assert_in_delta stats.stddev ** 2 * n / (n - 1), stats.sample_variance, 1e-10
16+
end
17+
end
18+
19+
describe '.welch_p_value' do
20+
it 'returns 1.0 for identical distributions' do
21+
a = [1.0, 1.0, 1.0, 1.0, 1.0]
22+
b = [1.0, 1.0, 1.0, 1.0, 1.0]
23+
assert_in_delta 1.0, Stats.welch_p_value(a, b), 0.001
24+
end
25+
26+
it 'returns small p-value for clearly different distributions' do
27+
a = [100.0, 101.0, 99.0, 100.5, 99.5, 100.2, 99.8, 100.1, 99.9, 100.3]
28+
b = [50.0, 51.0, 49.0, 50.5, 49.5, 50.2, 49.8, 50.1, 49.9, 50.3]
29+
p_value = Stats.welch_p_value(a, b)
30+
assert p_value < 0.001, "Expected p < 0.001, got #{p_value}"
31+
end
32+
33+
it 'returns high p-value for overlapping distributions' do
34+
a = [10.0, 11.0, 9.0, 10.5, 9.5]
35+
b = [10.1, 10.9, 9.1, 10.6, 9.4]
36+
p_value = Stats.welch_p_value(a, b)
37+
assert p_value > 0.5, "Expected p > 0.5, got #{p_value}"
38+
end
39+
40+
it 'is symmetric' do
41+
a = [100.0, 102.0, 98.0, 101.0, 99.0]
42+
b = [90.0, 92.0, 88.0, 91.0, 89.0]
43+
assert_in_delta Stats.welch_p_value(a, b), Stats.welch_p_value(b, a), 1e-10
44+
end
45+
46+
it 'handles different sample sizes' do
47+
a = [100.0, 101.0, 99.0]
48+
b = [50.0, 51.0, 49.0, 50.5, 49.5, 50.2, 49.8, 50.1, 49.9, 50.3]
49+
p_value = Stats.welch_p_value(a, b)
50+
assert p_value < 0.001, "Expected p < 0.001, got #{p_value}"
51+
end
52+
53+
it 'handles unequal variances' do
54+
a = [100.0, 100.0, 100.0, 100.0, 100.0] # zero variance
55+
b = [50.0, 60.0, 40.0, 70.0, 30.0] # high variance
56+
p_value = Stats.welch_p_value(a, b)
57+
assert p_value < 0.05, "Expected p < 0.05, got #{p_value}"
58+
end
59+
60+
it 'returns 0.0 when both samples have zero variance but different means' do
61+
a = [10.0, 10.0, 10.0, 10.0, 10.0]
62+
b = [5.0, 5.0, 5.0, 5.0, 5.0]
63+
assert_equal 0.0, Stats.welch_p_value(a, b)
64+
end
65+
66+
it 'returns nil when a sample has fewer than 2 elements' do
67+
assert_nil Stats.welch_p_value([10.0], [5.0, 6.0, 7.0])
68+
assert_nil Stats.welch_p_value([5.0, 6.0, 7.0], [10.0])
69+
assert_nil Stats.welch_p_value([10.0], [5.0])
70+
end
71+
end
72+
end

0 commit comments

Comments
 (0)