Calculate and add p-value and stddev to summary table

eightbitraptor · eightbitraptor · commit 7bd2f2d1eae6 · 2026-02-18T15:19:00.000Z
Using Welch's t-test for p-values because I don't want to assume that
both versions of Ruby tested are going to have equal timing
distributions.
diff --git a/lib/results_table_builder.rb b/lib/results_table_builder.rb
@@ -47,7 +47,7 @@ def build_header
     end
 
     @other_names.each do |name|
-      header << "#{@base_name}/#{name}"
+      header << "#{@base_name}/#{name}" << "p-value" << "sig"
     end
 
     header
@@ -66,7 +66,7 @@ def build_format
     end
 
     @other_names.each do |_name|
-      format << "%.3f"
+      format << "%.3f" << "%s" << "%s"
     end
 
     format
@@ -105,9 +105,38 @@ def build_comparison_columns(row, other_ts, other_rsss)
 
   def build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts)
     ratio_1sts = other_t0s.map { |other_t0| base_t0 / other_t0 }
-    ratios = other_ts.map { |other_t| mean(base_t) / mean(other_t) }
     row.concat(ratio_1sts)
-    row.concat(ratios)
+
+    other_ts.each do |other_t|
+      pval = Stats.welch_p_value(base_t, other_t)
+      row << mean(base_t) / mean(other_t)
+      row << format_p_value(pval)
+      row << significance_level(pval)
+    end
+  end
+
+  def format_p_value(pval)
+    return "N/A" if pval.nil?
+
+    if pval >= 0.001
+      "%.3f" % pval
+    else
+      "%.1e" % pval
+    end
+  end
+
+  def significance_level(pval)
+    return "" if pval.nil?
+
+    if pval < 0.001
+      "p < 0.001"
+    elsif pval < 0.01
+      "p < 0.01"
+    elsif pval < 0.05
+      "p < 0.05"
+    else
+      ""
+    end
   end
 
   def extract_first_iteration_times(bench_name)
diff --git a/misc/stats.rb b/misc/stats.rb
@@ -1,4 +1,101 @@
 class Stats
+  class << self
+    # Welch's t-test (two-tailed). Returns the p-value, or nil if
+    # either sample is too small to compute a meaningful test.
+    def welch_p_value(a, b)
+      return nil if a.size < 2 || b.size < 2
+
+      stats_a = new(a)
+      stats_b = new(b)
+
+      n_a = a.size.to_f
+      n_b = b.size.to_f
+      var_a = stats_a.sample_variance
+      var_b = stats_b.sample_variance
+
+      se_sq = var_a / n_a + var_b / n_b
+      if se_sq == 0.0
+        # Both samples have zero variance — if means match they're
+        # indistinguishable, otherwise they're trivially different.
+        return stats_a.mean == stats_b.mean ? 1.0 : 0.0
+      end
+
+      t = (stats_a.mean - stats_b.mean) / Math.sqrt(se_sq)
+
+      # Welch-Satterthwaite degrees of freedom
+      df = se_sq ** 2 / ((var_a / n_a) ** 2 / (n_a - 1) + (var_b / n_b) ** 2 / (n_b - 1))
+
+      # Two-tailed p-value: I_x(df/2, 1/2) where x = df/(df + t^2)
+      x = df / (df + t * t)
+      regularized_incomplete_beta(x, df / 2.0, 0.5)
+    end
+
+    private
+
+    # Regularized incomplete beta function I_x(alpha, beta) via continued fraction (Lentz's method).
+    # Returns the probability that a Beta(alpha, beta)-distributed variable is <= x.
+    def regularized_incomplete_beta(x, alpha, beta)
+      return 0.0 if x <= 0.0
+      return 1.0 if x >= 1.0
+
+      # Symmetry relation: pick the side that converges faster
+      if x > (alpha + 1.0) / (alpha + beta + 2.0)
+        return 1.0 - regularized_incomplete_beta(1.0 - x, beta, alpha)
+      end
+
+      # B(alpha, beta) * x^alpha * (1-x)^beta — computed in log-space to avoid overflow
+      ln_normalizer = Math.lgamma(alpha + beta)[0] - Math.lgamma(alpha)[0] - Math.lgamma(beta)[0] +
+                      alpha * Math.log(x) + beta * Math.log(1.0 - x)
+      normalizer = Math.exp(ln_normalizer)
+
+      normalizer * beta_continued_fraction(x, alpha, beta) / alpha
+    end
+
+    # Evaluates the continued fraction for I_x(alpha, beta) using Lentz's algorithm.
+    # Each iteration computes two sub-steps (even and odd terms of the fraction).
+    def beta_continued_fraction(x, alpha, beta)
+      floor = 1.0e-30 # prevent division by zero in Lentz's method
+      converged = false
+
+      numerator_term = 1.0
+      denominator_term = 1.0 - (alpha + beta) * x / (alpha + 1.0)
+      denominator_term = floor if denominator_term.abs < floor
+      denominator_term = 1.0 / denominator_term
+      fraction = denominator_term
+
+      (1..200).each do |iteration|
+        two_i = 2 * iteration
+
+        # Even sub-step: d_{2m} coefficient of the continued fraction
+        coeff = iteration * (beta - iteration) * x / ((alpha + two_i - 1.0) * (alpha + two_i))
+        denominator_term = 1.0 + coeff * denominator_term
+        denominator_term = floor if denominator_term.abs < floor
+        numerator_term = 1.0 + coeff / numerator_term
+        numerator_term = floor if numerator_term.abs < floor
+        denominator_term = 1.0 / denominator_term
+        fraction *= denominator_term * numerator_term
+
+        # Odd sub-step: d_{2m+1} coefficient of the continued fraction
+        coeff = -(alpha + iteration) * (alpha + beta + iteration) * x / ((alpha + two_i) * (alpha + two_i + 1.0))
+        denominator_term = 1.0 + coeff * denominator_term
+        denominator_term = floor if denominator_term.abs < floor
+        numerator_term = 1.0 + coeff / numerator_term
+        numerator_term = floor if numerator_term.abs < floor
+        denominator_term = 1.0 / denominator_term
+        correction = denominator_term * numerator_term
+        fraction *= correction
+
+        if (correction - 1.0).abs < 1.0e-10
+          converged = true
+          break
+        end
+      end
+
+      warn "Stats.beta_continued_fraction: did not converge (alpha=#{alpha}, beta=#{beta}, x=#{x})" unless converged
+      fraction
+    end
+  end
+
   def initialize(data)
     @data = data
   end
@@ -15,13 +112,20 @@ def mean
     @data.sum(0.0) / @data.size
   end
 
+  # Population standard deviation (N denominator) — describes these specific values.
   def stddev
     mean = self.mean
     diffs_squared = @data.map { |v| (v-mean) * (v-mean) }
     mean_squared = diffs_squared.sum(0.0) / @data.size
     Math.sqrt(mean_squared)
   end
 
+  # Unbiased sample variance (N-1 denominator, Bessel's correction) — for inference.
+  def sample_variance
+    m = mean
+    @data.sum { |v| (v - m) ** 2 } / (@data.size - 1).to_f
+  end
+
   def median
     compute_median(@data)
   end
diff --git a/test/results_table_builder_test.rb b/test/results_table_builder_test.rb
@@ -56,15 +56,17 @@
 
       table, format = builder.build
 
-      assert_equal ['bench', 'ruby (ms)', 'stddev (%)', 'ruby-yjit (ms)', 'stddev (%)', 'ruby-yjit 1st itr', 'ruby/ruby-yjit'], table[0]
+      assert_equal ['bench', 'ruby (ms)', 'stddev (%)', 'ruby-yjit (ms)', 'stddev (%)', 'ruby-yjit 1st itr', 'ruby/ruby-yjit', 'p-value', 'sig'], table[0]
 
-      assert_equal ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f'], format
+      assert_equal ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%s', '%s'], format
 
       assert_equal 'fib', table[1][0]
       assert_in_delta 100.0, table[1][1], 1.0
       assert_in_delta 50.0, table[1][3], 1.0
       assert_in_delta 2.0, table[1][5], 0.1
       assert_in_delta 2.0, table[1][6], 0.1
+      assert_instance_of String, table[1][7]
+      assert_instance_of String, table[1][8]
     end
 
     it 'includes RSS columns when include_rss is true' do
@@ -171,12 +173,12 @@
         'ruby-rjit (ms)', 'stddev (%)',
         'ruby-yjit 1st itr',
         'ruby-rjit 1st itr',
-        'ruby/ruby-yjit',
-        'ruby/ruby-rjit'
+        'ruby/ruby-yjit', 'p-value', 'sig',
+        'ruby/ruby-rjit', 'p-value', 'sig'
       ]
       assert_equal expected_header, table[0]
 
-      expected_format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%.3f', '%.3f']
+      expected_format = ['%s', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.1f', '%.3f', '%.3f', '%.3f', '%s', '%s', '%.3f', '%s', '%s']
       assert_equal expected_format, format
     end
 
@@ -308,6 +310,68 @@
       assert_equal 'fib', table[1][0]
     end
 
+    it 'shows small p-value in scientific notation for clearly different distributions' do
+      executable_names = ['ruby', 'ruby-yjit']
+      bench_data = {
+        'ruby' => {
+          'fib' => {
+            'warmup' => [0.1],
+            'bench' => [0.100, 0.101, 0.099, 0.1005, 0.0995, 0.1002, 0.0998, 0.1001, 0.0999, 0.1003],
+            'rss' => 1024 * 1024 * 10
+          }
+        },
+        'ruby-yjit' => {
+          'fib' => {
+            'warmup' => [0.05],
+            'bench' => [0.050, 0.051, 0.049, 0.0505, 0.0495, 0.0502, 0.0498, 0.0501, 0.0499, 0.0503],
+            'rss' => 1024 * 1024 * 12
+          }
+        }
+      }
+
+      builder = ResultsTableBuilder.new(
+        executable_names: executable_names,
+        bench_data: bench_data,
+        include_rss: false
+      )
+
+      table, _format = builder.build
+      p_value_str = table[1][-2]
+      sig_str = table[1].last
+      assert_match(/e-/, p_value_str, "Expected scientific notation for very small p-value, got #{p_value_str}")
+      assert_equal "p < 0.001", sig_str
+    end
+
+    it 'shows N/A p-value when samples have fewer than 2 elements' do
+      executable_names = ['ruby', 'ruby-yjit']
+      bench_data = {
+        'ruby' => {
+          'fib' => {
+            'warmup' => [0.1],
+            'bench' => [0.1],
+            'rss' => 1024 * 1024 * 10
+          }
+        },
+        'ruby-yjit' => {
+          'fib' => {
+            'warmup' => [0.05],
+            'bench' => [0.05],
+            'rss' => 1024 * 1024 * 12
+          }
+        }
+      }
+
+      builder = ResultsTableBuilder.new(
+        executable_names: executable_names,
+        bench_data: bench_data,
+        include_rss: false
+      )
+
+      table, _format = builder.build
+      assert_equal 'N/A', table[1][-2]
+      assert_equal '', table[1].last
+    end
+
     it 'handles only headline benchmarks' do
       executable_names = ['ruby']
       bench_data = {
diff --git a/test/stats_test.rb b/test/stats_test.rb
@@ -0,0 +1,72 @@
+require_relative 'test_helper'
+require_relative '../misc/stats'
+
+describe Stats do
+  describe '#sample_variance' do
+    it 'computes unbiased sample variance (n-1 denominator)' do
+      stats = Stats.new([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
+      assert_in_delta 4.571, stats.sample_variance, 0.001
+    end
+
+    it 'equals population variance * n/(n-1)' do
+      data = [10.0, 12.0, 14.0, 16.0, 18.0]
+      stats = Stats.new(data)
+      n = data.size.to_f
+      assert_in_delta stats.stddev ** 2 * n / (n - 1), stats.sample_variance, 1e-10
+    end
+  end
+
+  describe '.welch_p_value' do
+    it 'returns 1.0 for identical distributions' do
+      a = [1.0, 1.0, 1.0, 1.0, 1.0]
+      b = [1.0, 1.0, 1.0, 1.0, 1.0]
+      assert_in_delta 1.0, Stats.welch_p_value(a, b), 0.001
+    end
+
+    it 'returns small p-value for clearly different distributions' do
+      a = [100.0, 101.0, 99.0, 100.5, 99.5, 100.2, 99.8, 100.1, 99.9, 100.3]
+      b = [50.0, 51.0, 49.0, 50.5, 49.5, 50.2, 49.8, 50.1, 49.9, 50.3]
+      p_value = Stats.welch_p_value(a, b)
+      assert p_value < 0.001, "Expected p < 0.001, got #{p_value}"
+    end
+
+    it 'returns high p-value for overlapping distributions' do
+      a = [10.0, 11.0, 9.0, 10.5, 9.5]
+      b = [10.1, 10.9, 9.1, 10.6, 9.4]
+      p_value = Stats.welch_p_value(a, b)
+      assert p_value > 0.5, "Expected p > 0.5, got #{p_value}"
+    end
+
+    it 'is symmetric' do
+      a = [100.0, 102.0, 98.0, 101.0, 99.0]
+      b = [90.0, 92.0, 88.0, 91.0, 89.0]
+      assert_in_delta Stats.welch_p_value(a, b), Stats.welch_p_value(b, a), 1e-10
+    end
+
+    it 'handles different sample sizes' do
+      a = [100.0, 101.0, 99.0]
+      b = [50.0, 51.0, 49.0, 50.5, 49.5, 50.2, 49.8, 50.1, 49.9, 50.3]
+      p_value = Stats.welch_p_value(a, b)
+      assert p_value < 0.001, "Expected p < 0.001, got #{p_value}"
+    end
+
+    it 'handles unequal variances' do
+      a = [100.0, 100.0, 100.0, 100.0, 100.0]  # zero variance
+      b = [50.0, 60.0, 40.0, 70.0, 30.0]        # high variance
+      p_value = Stats.welch_p_value(a, b)
+      assert p_value < 0.05, "Expected p < 0.05, got #{p_value}"
+    end
+
+    it 'returns 0.0 when both samples have zero variance but different means' do
+      a = [10.0, 10.0, 10.0, 10.0, 10.0]
+      b = [5.0, 5.0, 5.0, 5.0, 5.0]
+      assert_equal 0.0, Stats.welch_p_value(a, b)
+    end
+
+    it 'returns nil when a sample has fewer than 2 elements' do
+      assert_nil Stats.welch_p_value([10.0], [5.0, 6.0, 7.0])
+      assert_nil Stats.welch_p_value([5.0, 6.0, 7.0], [10.0])
+      assert_nil Stats.welch_p_value([10.0], [5.0])
+    end
+  end
+end