Skip to content

Commit bf3b7b3

Browse files
Add significance asterisks (based on the p-value) to the ratio comparison column
The actual p-value columns are still gated behind --pvalue
1 parent 33ea8ce commit bf3b7b3

File tree

4 files changed

+30
-7
lines changed

4 files changed

+30
-7
lines changed

lib/benchmark_runner.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ def build_output_text(ruby_descriptions, table, format, bench_failures)
6666
output_str << "- #{name} 1st itr: ratio of #{base_name}/#{name} time for the first benchmarking iteration.\n"
6767
output_str << "- #{base_name}/#{name}: ratio of #{base_name}/#{name} time. Higher is better for #{name}. Above 1 represents a speedup.\n"
6868
end
69+
output_str << "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)\n"
6970
end
7071

7172
output_str

lib/results_table_builder.rb

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def build_format
7070
end
7171

7272
@other_names.each do |_name|
73-
format << "%.3f"
73+
format << "%s"
7474
if @include_pvalue
7575
format << "%s" << "%s"
7676
end
@@ -115,15 +115,21 @@ def build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts)
115115
row.concat(ratio_1sts)
116116

117117
other_ts.each do |other_t|
118-
row << mean(base_t) / mean(other_t)
118+
pval = Stats.welch_p_value(base_t, other_t)
119+
row << format_ratio(mean(base_t) / mean(other_t), pval)
119120
if @include_pvalue
120-
pval = Stats.welch_p_value(base_t, other_t)
121121
row << format_p_value(pval)
122122
row << significance_level(pval)
123123
end
124124
end
125125
end
126126

127+
# Renders a ratio as a three-decimal string and, when significance_symbol
# yields a non-empty marker for the given p-value, appends that marker in
# parentheses (for example "2.000 (***)").
#
# ratio - numeric value to render with "%.3f".
# pval  - p-value forwarded unchanged to significance_symbol.
#
# Returns a String.
def format_ratio(ratio, pval)
  marker = significance_symbol(pval)
  text = Kernel.format("%.3f", ratio)
  return text if marker.empty?

  "#{text} (#{marker})"
end
132+
127133
def format_p_value(pval)
128134
return "N/A" if pval.nil?
129135

@@ -134,6 +140,20 @@ def format_p_value(pval)
134140
end
135141
end
136142

143+
# Maps a p-value to an asterisk marker.
#
# pval - numeric p-value, or nil.
#
# Returns "***" when pval < 0.001, "**" when pval < 0.01, "*" when
# pval < 0.05, and an empty String otherwise (including when pval is nil).
def significance_symbol(pval)
  return "" if pval.nil?
  return "***" if pval < 0.001
  return "**" if pval < 0.01

  pval < 0.05 ? "*" : ""
end
156+
137157
def significance_level(pval)
138158
return "" if pval.nil?
139159

test/benchmark_runner_test.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,7 @@
387387
assert_includes result, 'Legend:'
388388
assert_includes result, '- ruby-yjit 1st itr: ratio of ruby-base/ruby-yjit time for the first benchmarking iteration.'
389389
assert_includes result, '- ruby-base/ruby-yjit: ratio of ruby-base/ruby-yjit time. Higher is better for ruby-yjit. Above 1 represents a speedup.'
390+
assert_includes result, "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)"
390391
end
391392

392393
it 'includes formatted table in output' do

test/results_table_builder_test.rb

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,13 @@
5858

5959
assert_equal ['bench', 'ruby (ms)', 'stddev (%)', 'ruby-yjit (ms)', 'stddev (%)', 'ruby-yjit 1st itr', 'ruby/ruby-yjit'], table[0]
6060

61-
assert_equal ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f'], format
61+
assert_equal ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%s'], format
6262

6363
assert_equal 'fib', table[1][0]
6464
assert_in_delta 100.0, table[1][1], 1.0
6565
assert_in_delta 50.0, table[1][3], 1.0
6666
assert_in_delta 2.0, table[1][5], 0.1
67-
assert_in_delta 2.0, table[1][6], 0.1
67+
assert_match(/^2\.0\d+/, table[1][6])
6868
end
6969

7070
it 'includes RSS columns when include_rss is true' do
@@ -176,7 +176,7 @@
176176
]
177177
assert_equal expected_header, table[0]
178178

179-
expected_format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%.3f', '%.3f']
179+
expected_format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%s', '%s']
180180
assert_equal expected_format, format
181181
end
182182

@@ -370,7 +370,7 @@
370370
assert_equal '', table[1].last
371371
end
372372

373-
it 'omits p-value columns when include_pvalue is false' do
373+
it 'always shows significance symbol but omits verbose columns without --pvalue' do
374374
executable_names = ['ruby', 'ruby-yjit']
375375
bench_data = {
376376
'ruby' => {
@@ -397,6 +397,7 @@
397397
table, _format = builder.build
398398
refute_includes table[0], 'p-value'
399399
refute_includes table[0], 'sig'
400+
assert_match(/\(\*{1,3}\)$/, table[1].last)
400401
end
401402

402403
it 'handles only headline benchmarks' do

0 commit comments

Comments
 (0)