Gate P-value calculations behind --pvalue

eightbitraptor · eightbitraptor · commit 33ea8ce966b4 · 2026-02-18T16:48:42.000Z
diff --git a/lib/argument_parser.rb b/lib/argument_parser.rb
@@ -23,6 +23,7 @@ class ArgumentParser
     :skip_yjit,
     :skip_zjit,
     :with_pre_init,
+    :pvalue,
     keyword_init: true
   )
 
@@ -144,6 +145,10 @@ def parse(argv)
         args.rss = true
       end
 
+      opts.on("--pvalue", "show p-value and significance columns for each comparison") do
+        args.pvalue = true
+      end
+
       opts.on("--graph", "generate a graph image of benchmark results") do
         args.graph = true
       end
@@ -224,6 +229,7 @@ def default_args
       name_filters: [],
       excludes: [],
       rss: false,
+      pvalue: false,
       graph: false,
       no_pinning: false,
       force_pinning: false,
diff --git a/lib/benchmark_runner/cli.rb b/lib/benchmark_runner/cli.rb
@@ -69,7 +69,8 @@ def run
       builder = ResultsTableBuilder.new(
         executable_names: ruby_descriptions.keys,
         bench_data: bench_data,
-        include_rss: args.rss
+        include_rss: args.rss,
+        include_pvalue: args.pvalue
       )
       table, format = builder.build
 
diff --git a/lib/results_table_builder.rb b/lib/results_table_builder.rb
@@ -5,10 +5,11 @@ class ResultsTableBuilder
   SECONDS_TO_MS = 1000.0
   BYTES_TO_MIB = 1024.0 * 1024.0
 
-  def initialize(executable_names:, bench_data:, include_rss: false)
+  def initialize(executable_names:, bench_data:, include_rss: false, include_pvalue: false)
     @executable_names = executable_names
     @bench_data = bench_data
     @include_rss = include_rss
+    @include_pvalue = include_pvalue
     @base_name = executable_names.first
     @other_names = executable_names[1..]
     @bench_names = compute_bench_names
@@ -47,7 +48,10 @@ def build_header
     end
 
     @other_names.each do |name|
-      header << "#{@base_name}/#{name}" << "p-value" << "sig"
+      header << "#{@base_name}/#{name}"
+      if @include_pvalue
+        header << "p-value" << "sig"
+      end
     end
 
     header
@@ -66,7 +70,10 @@ def build_format
     end
 
     @other_names.each do |_name|
-      format << "%.3f" << "%s" << "%s"
+      format << "%.3f"
+      if @include_pvalue
+        format << "%s" << "%s"
+      end
     end
 
     format
@@ -108,10 +115,12 @@ def build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts)
     row.concat(ratio_1sts)
 
     other_ts.each do |other_t|
-      pval = Stats.welch_p_value(base_t, other_t)
       row << mean(base_t) / mean(other_t)
-      row << format_p_value(pval)
-      row << significance_level(pval)
+      if @include_pvalue
+        pval = Stats.welch_p_value(base_t, other_t)
+        row << format_p_value(pval)
+        row << significance_level(pval)
+      end
     end
   end
 
diff --git a/test/argument_parser_test.rb b/test/argument_parser_test.rb
@@ -49,6 +49,7 @@ def setup_mock_ruby(path)
         assert_equal [], args.categories
         assert_equal [], args.name_filters
         assert_equal false, args.rss
+        assert_equal false, args.pvalue
         assert_equal false, args.graph
         assert_equal false, args.no_pinning
         assert_equal false, args.turbo
@@ -428,6 +429,15 @@ def setup_mock_ruby(path)
       end
     end
 
+    describe '--pvalue option' do
+      it 'sets pvalue flag' do
+        parser = ArgumentParser.new
+        args = parser.parse(['--pvalue'])
+
+        assert_equal true, args.pvalue
+      end
+    end
+
     describe '--graph option' do
       it 'sets graph flag' do
         parser = ArgumentParser.new
diff --git a/test/results_table_builder_test.rb b/test/results_table_builder_test.rb
@@ -56,17 +56,15 @@
 
       table, format = builder.build
 
-      assert_equal ['bench', 'ruby (ms)', 'stddev (%)', 'ruby-yjit (ms)', 'stddev (%)', 'ruby-yjit 1st itr', 'ruby/ruby-yjit', 'p-value', 'sig'], table[0]
+      assert_equal ['bench', 'ruby (ms)', 'stddev (%)', 'ruby-yjit (ms)', 'stddev (%)', 'ruby-yjit 1st itr', 'ruby/ruby-yjit'], table[0]
 
-      assert_equal ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%s', '%s'], format
+      assert_equal ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f'], format
 
       assert_equal 'fib', table[1][0]
       assert_in_delta 100.0, table[1][1], 1.0
       assert_in_delta 50.0, table[1][3], 1.0
       assert_in_delta 2.0, table[1][5], 0.1
       assert_in_delta 2.0, table[1][6], 0.1
-      assert_instance_of String, table[1][7]
-      assert_instance_of String, table[1][8]
     end
 
     it 'includes RSS columns when include_rss is true' do
@@ -173,12 +171,12 @@
         'ruby-rjit (ms)', 'stddev (%)',
         'ruby-yjit 1st itr',
         'ruby-rjit 1st itr',
-        'ruby/ruby-yjit', 'p-value', 'sig',
-        'ruby/ruby-rjit', 'p-value', 'sig'
+        'ruby/ruby-yjit',
+        'ruby/ruby-rjit'
       ]
       assert_equal expected_header, table[0]
 
-      expected_format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%.3f', '%s', '%s', '%.3f', '%s', '%s']
+      expected_format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%.3f', '%.3f']
       assert_equal expected_format, format
     end
 
@@ -332,7 +330,7 @@
       builder = ResultsTableBuilder.new(
         executable_names: executable_names,
         bench_data: bench_data,
-        include_rss: false
+        include_pvalue: true
       )
 
       table, _format = builder.build
@@ -364,14 +362,43 @@
       builder = ResultsTableBuilder.new(
         executable_names: executable_names,
         bench_data: bench_data,
-        include_rss: false
+        include_pvalue: true
       )
 
       table, _format = builder.build
       assert_equal 'N/A', table[1][-2]
       assert_equal '', table[1].last
     end
 
+    it 'omits p-value columns when include_pvalue is false' do
+      executable_names = ['ruby', 'ruby-yjit']
+      bench_data = {
+        'ruby' => {
+          'fib' => {
+            'warmup' => [0.1],
+            'bench' => [0.100, 0.101, 0.099],
+            'rss' => 1024 * 1024 * 10
+          }
+        },
+        'ruby-yjit' => {
+          'fib' => {
+            'warmup' => [0.05],
+            'bench' => [0.050, 0.051, 0.049],
+            'rss' => 1024 * 1024 * 12
+          }
+        }
+      }
+
+      builder = ResultsTableBuilder.new(
+        executable_names: executable_names,
+        bench_data: bench_data
+      )
+
+      table, _format = builder.build
+      refute_includes table[0], 'p-value'
+      refute_includes table[0], 'sig'
+    end
+
     it 'handles only headline benchmarks' do
       executable_names = ['ruby']
       bench_data = {

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -69,7 +69,8 @@ def run` |
| `69` | `69` | `builder = ResultsTableBuilder.new(` |
| `70` | `70` | `executable_names: ruby_descriptions.keys,` |
| `71` | `71` | `bench_data: bench_data,` |
| `72` | | `- include_rss: args.rss` |
| | `72` | `+ include_rss: args.rss,` |
| | `73` | `+ include_pvalue: args.pvalue` |
| `73` | `74` | `)` |
| `74` | `75` | `table, format = builder.build` |
| `75` | `76` | |
`75`	`76`