Merge pull request #479 from ruby/mvh-add-p-value

eightbitraptor · web-flow · commit c64ac863da18 · 2026-02-18T17:32:58.000Z
Calculate and add p-value and stddev to summary table
diff --git a/lib/argument_parser.rb b/lib/argument_parser.rb
@@ -23,6 +23,7 @@ class ArgumentParser
     :skip_yjit,
     :skip_zjit,
     :with_pre_init,
+    :pvalue,
     keyword_init: true
   )
 
@@ -144,6 +145,10 @@ def parse(argv)
         args.rss = true
       end
 
+      opts.on("--pvalue", "show p-value and significance columns for each comparison") do
+        args.pvalue = true
+      end
+
       opts.on("--graph", "generate a graph image of benchmark results") do
         args.graph = true
       end
@@ -224,6 +229,7 @@ def default_args
       name_filters: [],
       excludes: [],
       rss: false,
+      pvalue: false,
       graph: false,
       no_pinning: false,
       force_pinning: false,
diff --git a/lib/benchmark_runner.rb b/lib/benchmark_runner.rb
@@ -66,6 +66,7 @@ def build_output_text(ruby_descriptions, table, format, bench_failures)
           output_str << "- #{name} 1st itr: ratio of #{base_name}/#{name} time for the first benchmarking iteration.\n"
           output_str << "- #{base_name}/#{name}: ratio of #{base_name}/#{name} time. Higher is better for #{name}. Above 1 represents a speedup.\n"
         end
+        output_str << "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)\n"
       end
 
       output_str
diff --git a/lib/benchmark_runner/cli.rb b/lib/benchmark_runner/cli.rb
@@ -69,7 +69,8 @@ def run
       builder = ResultsTableBuilder.new(
         executable_names: ruby_descriptions.keys,
         bench_data: bench_data,
-        include_rss: args.rss
+        include_rss: args.rss,
+        include_pvalue: args.pvalue
       )
       table, format = builder.build
 
diff --git a/lib/results_table_builder.rb b/lib/results_table_builder.rb
@@ -5,10 +5,11 @@ class ResultsTableBuilder
   SECONDS_TO_MS = 1000.0
   BYTES_TO_MIB = 1024.0 * 1024.0
 
-  def initialize(executable_names:, bench_data:, include_rss: false)
+  def initialize(executable_names:, bench_data:, include_rss: false, include_pvalue: false)
     @executable_names = executable_names
     @bench_data = bench_data
     @include_rss = include_rss
+    @include_pvalue = include_pvalue
     @base_name = executable_names.first
     @other_names = executable_names[1..]
     @bench_names = compute_bench_names
@@ -48,6 +49,9 @@ def build_header
 
     @other_names.each do |name|
       header << "#{@base_name}/#{name}"
+      if @include_pvalue
+        header << "p-value" << "sig"
+      end
     end
 
     header
@@ -66,7 +70,10 @@ def build_format
     end
 
     @other_names.each do |_name|
-      format << "%.3f"
+      format << "%s"
+      if @include_pvalue
+        format << "%s" << "%s"
+      end
     end
 
     format
@@ -105,9 +112,60 @@ def build_comparison_columns(row, other_ts, other_rsss)
 
   def build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts)
     ratio_1sts = other_t0s.map { |other_t0| base_t0 / other_t0 }
-    ratios = other_ts.map { |other_t| mean(base_t) / mean(other_t) }
     row.concat(ratio_1sts)
-    row.concat(ratios)
+
+    other_ts.each do |other_t|
+      pval = Stats.welch_p_value(base_t, other_t)
+      row << format_ratio(mean(base_t) / mean(other_t), pval)
+      if @include_pvalue
+        row << format_p_value(pval)
+        row << significance_level(pval)
+      end
+    end
+  end
+
+  def format_ratio(ratio, pval)
+    sym = significance_symbol(pval)
+    formatted = "%.3f" % ratio
+    sym.empty? ? formatted : "#{formatted} (#{sym})"
+  end
+
+  def format_p_value(pval)
+    return "N/A" if pval.nil?
+
+    if pval >= 0.001
+      "%.3f" % pval
+    else
+      "%.1e" % pval
+    end
+  end
+
+  def significance_symbol(pval)
+    return "" if pval.nil?
+
+    if pval < 0.001
+      "***"
+    elsif pval < 0.01
+      "**"
+    elsif pval < 0.05
+      "*"
+    else
+      ""
+    end
+  end
+
+  def significance_level(pval)
+    return "" if pval.nil?
+
+    if pval < 0.001
+      "p < 0.001"
+    elsif pval < 0.01
+      "p < 0.01"
+    elsif pval < 0.05
+      "p < 0.05"
+    else
+      ""
+    end
   end
 
   def extract_first_iteration_times(bench_name)
diff --git a/misc/stats.rb b/misc/stats.rb
@@ -1,4 +1,101 @@
 class Stats
+  class << self
+    # Welch's t-test (two-tailed). Returns the p-value, or nil if
+    # either sample is too small to compute a meaningful test.
+    def welch_p_value(a, b)
+      return nil if a.size < 2 || b.size < 2
+
+      stats_a = new(a)
+      stats_b = new(b)
+
+      n_a = a.size.to_f
+      n_b = b.size.to_f
+      var_a = stats_a.sample_variance
+      var_b = stats_b.sample_variance
+
+      se_sq = var_a / n_a + var_b / n_b
+      if se_sq == 0.0
+        # Both samples have zero variance — if means match they're
+        # indistinguishable, otherwise they're trivially different.
+        return stats_a.mean == stats_b.mean ? 1.0 : 0.0
+      end
+
+      t = (stats_a.mean - stats_b.mean) / Math.sqrt(se_sq)
+
+      # Welch-Satterthwaite degrees of freedom
+      df = se_sq ** 2 / ((var_a / n_a) ** 2 / (n_a - 1) + (var_b / n_b) ** 2 / (n_b - 1))
+
+      # Two-tailed p-value: I_x(df/2, 1/2) where x = df/(df + t^2)
+      x = df / (df + t * t)
+      regularized_incomplete_beta(x, df / 2.0, 0.5)
+    end
+
+    private
+
+    # Regularized incomplete beta function I_x(alpha, beta) via continued fraction (Lentz's method).
+    # Returns the probability that a Beta(alpha, beta)-distributed variable is <= x.
+    def regularized_incomplete_beta(x, alpha, beta)
+      return 0.0 if x <= 0.0
+      return 1.0 if x >= 1.0
+
+      # Symmetry relation: pick the side that converges faster
+      if x > (alpha + 1.0) / (alpha + beta + 2.0)
+        return 1.0 - regularized_incomplete_beta(1.0 - x, beta, alpha)
+      end
+
+      # B(alpha, beta) * x^alpha * (1-x)^beta — computed in log-space to avoid overflow
+      ln_normalizer = Math.lgamma(alpha + beta)[0] - Math.lgamma(alpha)[0] - Math.lgamma(beta)[0] +
+                      alpha * Math.log(x) + beta * Math.log(1.0 - x)
+      normalizer = Math.exp(ln_normalizer)
+
+      normalizer * beta_continued_fraction(x, alpha, beta) / alpha
+    end
+
+    # Evaluates the continued fraction for I_x(alpha, beta) using Lentz's algorithm.
+    # Each iteration computes two sub-steps (even and odd terms of the fraction).
+    def beta_continued_fraction(x, alpha, beta)
+      floor = 1.0e-30 # prevent division by zero in Lentz's method
+      converged = false
+
+      numerator_term = 1.0
+      denominator_term = 1.0 - (alpha + beta) * x / (alpha + 1.0)
+      denominator_term = floor if denominator_term.abs < floor
+      denominator_term = 1.0 / denominator_term
+      fraction = denominator_term
+
+      (1..200).each do |iteration|
+        two_i = 2 * iteration
+
+        # Even sub-step: d_{2m} coefficient of the continued fraction
+        coeff = iteration * (beta - iteration) * x / ((alpha + two_i - 1.0) * (alpha + two_i))
+        denominator_term = 1.0 + coeff * denominator_term
+        denominator_term = floor if denominator_term.abs < floor
+        numerator_term = 1.0 + coeff / numerator_term
+        numerator_term = floor if numerator_term.abs < floor
+        denominator_term = 1.0 / denominator_term
+        fraction *= denominator_term * numerator_term
+
+        # Odd sub-step: d_{2m+1} coefficient of the continued fraction
+        coeff = -(alpha + iteration) * (alpha + beta + iteration) * x / ((alpha + two_i) * (alpha + two_i + 1.0))
+        denominator_term = 1.0 + coeff * denominator_term
+        denominator_term = floor if denominator_term.abs < floor
+        numerator_term = 1.0 + coeff / numerator_term
+        numerator_term = floor if numerator_term.abs < floor
+        denominator_term = 1.0 / denominator_term
+        correction = denominator_term * numerator_term
+        fraction *= correction
+
+        if (correction - 1.0).abs < 1.0e-10
+          converged = true
+          break
+        end
+      end
+
+      warn "Stats.beta_continued_fraction: did not converge (alpha=#{alpha}, beta=#{beta}, x=#{x})" unless converged
+      fraction
+    end
+  end
+
   def initialize(data)
     @data = data
   end
@@ -15,13 +112,20 @@ def mean
     @data.sum(0.0) / @data.size
   end
 
+  # Population standard deviation (N denominator) — describes these specific values.
   def stddev
     mean = self.mean
     diffs_squared = @data.map { |v| (v-mean) * (v-mean) }
     mean_squared = diffs_squared.sum(0.0) / @data.size
     Math.sqrt(mean_squared)
   end
 
+  # Unbiased sample variance (N-1 denominator, Bessel's correction) — for inference.
+  def sample_variance
+    m = mean
+    @data.sum { |v| (v - m) ** 2 } / (@data.size - 1).to_f
+  end
+
   def median
     compute_median(@data)
   end
diff --git a/test/argument_parser_test.rb b/test/argument_parser_test.rb
@@ -49,6 +49,7 @@ def setup_mock_ruby(path)
         assert_equal [], args.categories
         assert_equal [], args.name_filters
         assert_equal false, args.rss
+        assert_equal false, args.pvalue
         assert_equal false, args.graph
         assert_equal false, args.no_pinning
         assert_equal false, args.turbo
@@ -428,6 +429,15 @@ def setup_mock_ruby(path)
       end
     end
 
+    describe '--pvalue option' do
+      it 'sets pvalue flag' do
+        parser = ArgumentParser.new
+        args = parser.parse(['--pvalue'])
+
+        assert_equal true, args.pvalue
+      end
+    end
+
     describe '--graph option' do
       it 'sets graph flag' do
         parser = ArgumentParser.new
diff --git a/test/benchmark_runner_test.rb b/test/benchmark_runner_test.rb
@@ -387,6 +387,7 @@
       assert_includes result, 'Legend:'
       assert_includes result, '- ruby-yjit 1st itr: ratio of ruby-base/ruby-yjit time for the first benchmarking iteration.'
       assert_includes result, '- ruby-base/ruby-yjit: ratio of ruby-base/ruby-yjit time. Higher is better for ruby-yjit. Above 1 represents a speedup.'
+      assert_includes result, "- ***: p < 0.001, **: p < 0.01, *: p < 0.05 (Welch's t-test)"
     end
 
     it 'includes formatted table in output' do
diff --git a/test/results_table_builder_test.rb b/test/results_table_builder_test.rb
diff --git a/test/stats_test.rb b/test/stats_test.rb

Original file line number	Diff line number	Diff line change
`@@ -69,7 +69,8 @@ def run`
`69`	`69`	`builder = ResultsTableBuilder.new(`
`70`	`70`	`executable_names: ruby_descriptions.keys,`
`71`	`71`	`bench_data: bench_data,`
`72`		`- include_rss: args.rss`
	`72`	`+ include_rss: args.rss,`
	`73`	`+ include_pvalue: args.pvalue`
`73`	`74`	`)`
`74`	`75`	`table, format = builder.build`
`75`	`76`