From 5a57efc02911731bbf13e3f5c713ca2fb010afee Mon Sep 17 00:00:00 2001
From: Andrew Nesbitt <andrewnez@gmail.com>
Date: Sun, 4 Jan 2026 16:48:46 +0000
Subject: [PATCH] Add benchmark scripts and documentation

---
 CONTRIBUTING.md                               |  2 +-
 benchmark_bulk.rb => benchmark/bulk.rb        |  0
 benchmark/commands.rb                         | 91 ++++++++++++++++++
 benchmark_db.rb => benchmark/db.rb            |  0
 .../detailed.rb                               |  0
 benchmark_full.rb => benchmark/full.rb        |  0
 bin/benchmark                                 | 30 ++++++
 docs/README.md                                |  9 ++
 docs/benchmarking.md                          | 94 +++++++++++++++++++
 git-pkgs.gemspec                              |  3 +-
 10 files changed, 227 insertions(+), 2 deletions(-)
 rename benchmark_bulk.rb => benchmark/bulk.rb (100%)
 create mode 100644 benchmark/commands.rb
 rename benchmark_db.rb => benchmark/db.rb (100%)
 rename benchmark_detailed.rb => benchmark/detailed.rb (100%)
 rename benchmark_full.rb => benchmark/full.rb (100%)
 create mode 100755 bin/benchmark
 create mode 100644 docs/README.md
 create mode 100644 docs/benchmarking.md

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index cebe98d..d6332e5 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -9,7 +9,7 @@ The database schema stores:
 - Dependency changes (added/modified/removed) with before/after versions
 - Periodic snapshots of full dependency state for efficient point-in-time queries
 
-See [docs/internals.md](docs/internals.md) for a detailed architecture overview and [docs/schema.md](docs/schema.md) for the database schema.
+See the [docs](docs/) folder for architecture details, database schema, and benchmarking tools.
 
 Since the database is just SQLite, you can query it directly for ad-hoc analysis:
 
diff --git a/benchmark_bulk.rb b/benchmark/bulk.rb
similarity index 100%
rename from benchmark_bulk.rb
rename to benchmark/bulk.rb
diff --git a/benchmark/commands.rb b/benchmark/commands.rb
new file mode 100644
index 0000000..dd759fd
--- /dev/null
+++ b/benchmark/commands.rb
@@ -0,0 +1,91 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require "benchmark"
+require "optparse"
+
+options = {
+  iterations: 3,
+  repo: nil
+}
+
+OptionParser.new do |opts|
+  opts.banner = "Usage: bin/benchmark commands [options]"
+
+  opts.on("-r", "--repo=PATH", "Path to repository to benchmark against") do |v|
+    options[:repo] = v
+  end
+
+  opts.on("-n", "--iterations=N", Integer, "Number of iterations per command (default: 3)") do |v|
+    options[:iterations] = v
+  end
+
+  opts.on("-h", "--help", "Show this help") do
+    puts opts
+    exit
+  end
+end.parse!
+
+unless options[:repo]
+  puts "Error: --repo is required"
+  puts "Usage: bin/benchmark commands --repo /path/to/repo"
+  exit 1
+end
+
+repo_path = File.expand_path(options[:repo])
+unless File.directory?(repo_path)
+  puts "Error: #{repo_path} is not a directory"
+  exit 1
+end
+
+unless File.exist?(File.join(repo_path, ".git", "pkgs.sqlite3"))
+  puts "Error: #{repo_path} does not have a git-pkgs database"
+  puts "Run 'git pkgs init' in that repository first"
+  exit 1
+end
+
+iterations = options[:iterations]
+gem_root = File.expand_path("../..", __FILE__)
+
+# Use bundle exec to ensure we run the local development version
+commands = {
+  "blame" => "bundle exec --gemfile=#{gem_root}/Gemfile ruby -I#{gem_root}/lib #{gem_root}/exe/git-pkgs blame --no-pager",
+  "stale" => "bundle exec --gemfile=#{gem_root}/Gemfile ruby -I#{gem_root}/lib #{gem_root}/exe/git-pkgs stale --no-pager",
+  "stats" => "bundle exec --gemfile=#{gem_root}/Gemfile ruby -I#{gem_root}/lib #{gem_root}/exe/git-pkgs stats --no-pager",
+  "log" => "bundle exec --gemfile=#{gem_root}/Gemfile ruby -I#{gem_root}/lib #{gem_root}/exe/git-pkgs log --no-pager",
+  "list" => "bundle exec --gemfile=#{gem_root}/Gemfile ruby -I#{gem_root}/lib #{gem_root}/exe/git-pkgs list --no-pager"
+}
+
+puts "Command benchmarks"
+puts "=" * 60
+puts "Repository: #{repo_path}"
+puts "Iterations: #{iterations}"
+puts
+
+results = {}
+
+Dir.chdir(repo_path) do
+  commands.each do |name, cmd|
+    times = []
+
+    # Warmup run
+    system(cmd, out: File::NULL, err: File::NULL)
+
+    iterations.times do
+      time = Benchmark.realtime do
+        system(cmd, out: File::NULL, err: File::NULL)
+      end
+      times << time
+    end
+
+    avg = times.sum / times.size
+    min = times.min
+    max = times.max
+    results[name] = { avg: avg, min: min, max: max }
+
+    puts format("%-10s avg: %6.3fs  min: %6.3fs  max: %6.3fs", name, avg, min, max)
+  end
+end
+
+puts
+puts "Total average: #{format("%.3fs", results.values.sum { |r| r[:avg] })}"
diff --git a/benchmark_db.rb b/benchmark/db.rb
similarity index 100%
rename from benchmark_db.rb
rename to benchmark/db.rb
diff --git a/benchmark_detailed.rb b/benchmark/detailed.rb
similarity index 100%
rename from benchmark_detailed.rb
rename to benchmark/detailed.rb
diff --git a/benchmark_full.rb b/benchmark/full.rb
similarity index 100%
rename from benchmark_full.rb
rename to benchmark/full.rb
diff --git a/bin/benchmark b/bin/benchmark
new file mode 100755
index 0000000..1ab7a1e
--- /dev/null
+++ b/bin/benchmark
@@ -0,0 +1,30 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+BENCHMARKS = {
+  "full" => "Full pipeline benchmark",
+  "detailed" => "Detailed phase breakdown",
+  "bulk" => "Bulk insert benchmark",
+  "db" => "DB operation breakdown",
+  "commands" => "CLI command benchmarks"
+}.freeze
+
+def usage
+  puts "Usage: bin/benchmark <type> [repo_path] [sample_size]"
+  puts "       bin/benchmark commands --repo /path/to/repo [-n iterations]"
+  puts
+  puts "Types:"
+  BENCHMARKS.each do |name, desc|
+    puts "  #{name.ljust(10)} #{desc}"
+  end
+  puts
+  puts "Example: bin/benchmark full /path/to/repo 500"
+  exit 1
+end
+
+type = ARGV.shift
+usage if type.nil? || type == "-h" || type == "--help"
+usage unless BENCHMARKS.key?(type)
+
+script = File.expand_path("../benchmark/#{type}.rb", __dir__)
+exec("ruby", script, *ARGV)
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..6f302d5
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,9 @@
+# Documentation
+
+Technical documentation for git-pkgs maintainers and contributors.
+
+- [internals.md](internals.md) - Architecture overview, how commands work, key algorithms
+- [schema.md](schema.md) - Database tables and relationships
+- [benchmarking.md](benchmarking.md) - Performance profiling tools
+
+For user-facing documentation, see the main [README](../README.md).
diff --git a/docs/benchmarking.md b/docs/benchmarking.md
new file mode 100644
index 0000000..892ecd4
--- /dev/null
+++ b/docs/benchmarking.md
@@ -0,0 +1,94 @@
+# Benchmarking
+
+git-pkgs includes benchmark scripts for profiling performance. Run them with:
+
+```bash
+bin/benchmark <type> [repo_path] [sample_size]
+```
+
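+If omitted, `repo_path` defaults to `/Users/andrew/code/octobox` (a machine-specific path, so pass your own repository explicitly) and `sample_size` defaults to 500 commits. The `commands` benchmark takes `--repo` and `-n` options instead; see below.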
+
+## Benchmark Types
+
+### full
+
+Full pipeline benchmark with phase breakdown:
+
+```bash
+bin/benchmark full /path/to/repo 500
+```
+
+Measures time spent in each phase: git diff extraction, manifest filtering, parsing, and database writes. Reports overall throughput in commits/sec.
+
+### detailed
+
+Granular breakdown of each processing step:
+
+```bash
+bin/benchmark detailed /path/to/repo 500
+```
+
+Shows timing for blob path extraction, regex pre-filtering, bibliothecary identification, and manifest parsing. Also breaks down parsing time by platform (rubygems, npm, etc.) and reports how many commits pass each filter stage.
+
+### bulk
+
+Compares data collection vs bulk insert performance:
+
+```bash
+bin/benchmark bulk /path/to/repo 500
+```
+
+Separates the time spent analyzing commits from the time spent writing to the database. Uses `insert_all` for bulk operations. Helps identify whether bottlenecks are in git/parsing or database writes.
+
+### db
+
+Individual database operation timing:
+
+```bash
+bin/benchmark db /path/to/repo 200
+```
+
+Measures each ActiveRecord operation separately: commit creation, branch_commit creation, manifest lookups, change inserts, and snapshot inserts. Shows per-operation averages in milliseconds.
+
+### commands
+
+End-to-end CLI command benchmarks:
+
+```bash
+bin/benchmark commands --repo /path/to/repo -n 3
+```
+
+Runs actual git-pkgs commands (`blame`, `stale`, `stats`, `log`, `list`) against a repo with an existing database. Measures wall-clock time over multiple iterations. Useful for regression testing command performance.
+
+The repo must already have a database from `git pkgs init`.
+
+## Interpreting Results
+
+The main bottlenecks are typically:
+
+1. **Git blob reads** - extracting file contents from commits
+2. **Bibliothecary parsing** - parsing manifest file contents
+3. **Database writes** - inserting records (mitigated by bulk inserts)
+
+The regex pre-filter (`might_have_manifests?`) skips most commits cheaply. On a typical codebase, only 10-20% of commits touch files that could be manifests.
+
+Blob OID caching helps when the same manifest content appears across multiple commits. The cache stats show hit rates.
+
+## Example Output
+
+```
+Full pipeline benchmark: 500 commits
+============================================================
+
+Full pipeline breakdown:
+------------------------------------------------------------
+  git_diff           0.892s  (12.3%)
+  filtering          0.234s  (3.2%)
+  parsing            4.521s  (62.4%)
+  db_writes          1.602s  (22.1%)
+------------------------------------------------------------
+  Total              7.249s
+
+Throughput: 69.0 commits/sec
+Cache stats: {:cached_blobs=>142, :blobs_with_hits=>89}
+```
diff --git a/git-pkgs.gemspec b/git-pkgs.gemspec
index 87c2272..a51e697 100644
--- a/git-pkgs.gemspec
+++ b/git-pkgs.gemspec
@@ -23,7 +23,8 @@ Gem::Specification.new do |spec|
   spec.files = IO.popen(%w[git ls-files -z], chdir: __dir__, err: IO::NULL) do |ls|
     ls.readlines("\x0", chomp: true).reject do |f|
       (f == gemspec) ||
-        f.start_with?(*%w[bin/ Gemfile .gitignore test/ .github/])
+        f.start_with?(*%w[bin/ Gemfile .gitignore test/ .github/ docs/ benchmark/]) ||
+        f.end_with?(*%w[Rakefile CODE_OF_CONDUCT.md CONTRIBUTING.md SECURITY.md])
     end
   end
   spec.bindir = "exe"