Add p-value significance asterisks to the comparison ratio column

eightbitraptor · eightbitraptor · commit bf3b7b31976d · 2026-02-18T17:16:06.000Z
The asterisks are always shown next to the ratio; the actual p-value and significance columns are still gated behind --pvalue.
diff --git a/lib/benchmark_runner.rb b/lib/benchmark_runner.rb
@@ -66,6 +66,7 @@ def build_output_text(ruby_descriptions, table, format, bench_failures)
           output_str << "- #{name} 1st itr: ratio of #{base_name}/#{name} time for the first benchmarking iteration.\n"
           output_str << "- #{base_name}/#{name}: ratio of #{base_name}/#{name} time. Higher is better for #{name}. Above 1 represents a speedup.\n"
         end
+        output_str << "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)\n"
       end
 
       output_str
diff --git a/lib/results_table_builder.rb b/lib/results_table_builder.rb
@@ -70,7 +70,7 @@ def build_format
     end
 
     @other_names.each do |_name|
-      format << "%.3f"
+      format << "%s"
       if @include_pvalue
         format << "%s" << "%s"
       end
@@ -115,15 +115,21 @@ def build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts)
     row.concat(ratio_1sts)
 
     other_ts.each do |other_t|
-      row << mean(base_t) / mean(other_t)
+      pval = Stats.welch_p_value(base_t, other_t)
+      row << format_ratio(mean(base_t) / mean(other_t), pval)
       if @include_pvalue
-        pval = Stats.welch_p_value(base_t, other_t)
         row << format_p_value(pval)
         row << significance_level(pval)
       end
     end
   end
 
+  def format_ratio(ratio, pval)
+    sym = significance_symbol(pval)
+    formatted = "%.3f" % ratio
+    sym.empty? ? formatted : "#{formatted} (#{sym})"
+  end
+
   def format_p_value(pval)
     return "N/A" if pval.nil?
 
@@ -134,6 +140,20 @@ def format_p_value(pval)
     end
   end
 
+  def significance_symbol(pval)
+    return "" if pval.nil?
+
+    if pval < 0.001
+      "***"
+    elsif pval < 0.01
+      "**"
+    elsif pval < 0.05
+      "*"
+    else
+      ""
+    end
+  end
+
   def significance_level(pval)
     return "" if pval.nil?
 
diff --git a/test/benchmark_runner_test.rb b/test/benchmark_runner_test.rb
@@ -387,6 +387,7 @@
       assert_includes result, 'Legend:'
       assert_includes result, '- ruby-yjit 1st itr: ratio of ruby-base/ruby-yjit time for the first benchmarking iteration.'
       assert_includes result, '- ruby-base/ruby-yjit: ratio of ruby-base/ruby-yjit time. Higher is better for ruby-yjit. Above 1 represents a speedup.'
+      assert_includes result, "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)"
     end
 
     it 'includes formatted table in output' do
diff --git a/test/results_table_builder_test.rb b/test/results_table_builder_test.rb
@@ -58,13 +58,13 @@
 
       assert_equal ['bench', 'ruby (ms)', 'stddev (%)', 'ruby-yjit (ms)', 'stddev (%)', 'ruby-yjit 1st itr', 'ruby/ruby-yjit'], table[0]
 
-      assert_equal ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f'], format
+      assert_equal ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%s'], format
 
       assert_equal 'fib', table[1][0]
       assert_in_delta 100.0, table[1][1], 1.0
       assert_in_delta 50.0, table[1][3], 1.0
       assert_in_delta 2.0, table[1][5], 0.1
-      assert_in_delta 2.0, table[1][6], 0.1
+      assert_match(/^2\.0\d+/, table[1][6])
     end
 
     it 'includes RSS columns when include_rss is true' do
@@ -176,7 +176,7 @@
       ]
       assert_equal expected_header, table[0]
 
-      expected_format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%.3f', '%.3f']
+      expected_format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%s', '%s']
       assert_equal expected_format, format
     end
 
@@ -370,7 +370,7 @@
       assert_equal '', table[1].last
     end
 
-    it 'omits p-value columns when include_pvalue is false' do
+    it 'always shows significance symbol but omits verbose columns without --pvalue' do
       executable_names = ['ruby', 'ruby-yjit']
       bench_data = {
         'ruby' => {
@@ -397,6 +397,7 @@
       table, _format = builder.build
       refute_includes table[0], 'p-value'
       refute_includes table[0], 'sig'
+      assert_match(/\(\*{1,3}\)$/, table[1].last)
     end
 
     it 'handles only headline benchmarks' do