From 5a57efc02911731bbf13e3f5c713ca2fb010afee Mon Sep 17 00:00:00 2001 From: Andrew Nesbitt Date: Sun, 4 Jan 2026 16:48:46 +0000 Subject: [PATCH] Add benchmark scripts and documentation --- CONTRIBUTING.md | 2 +- benchmark_bulk.rb => benchmark/bulk.rb | 0 benchmark/commands.rb | 91 ++++++++++++++++++ benchmark_db.rb => benchmark/db.rb | 0 .../detailed.rb | 0 benchmark_full.rb => benchmark/full.rb | 0 bin/benchmark | 30 ++++++ docs/README.md | 9 ++ docs/benchmarking.md | 94 +++++++++++++++++++ git-pkgs.gemspec | 3 +- 10 files changed, 227 insertions(+), 2 deletions(-) rename benchmark_bulk.rb => benchmark/bulk.rb (100%) create mode 100644 benchmark/commands.rb rename benchmark_db.rb => benchmark/db.rb (100%) rename benchmark_detailed.rb => benchmark/detailed.rb (100%) rename benchmark_full.rb => benchmark/full.rb (100%) create mode 100755 bin/benchmark create mode 100644 docs/README.md create mode 100644 docs/benchmarking.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cebe98d..d6332e5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,7 +9,7 @@ The database schema stores: - Dependency changes (added/modified/removed) with before/after versions - Periodic snapshots of full dependency state for efficient point-in-time queries -See [docs/internals.md](docs/internals.md) for a detailed architecture overview and [docs/schema.md](docs/schema.md) for the database schema. +See the [docs](docs/) folder for architecture details, database schema, and benchmarking tools. 
Since the database is just SQLite, you can query it directly for ad-hoc analysis: diff --git a/benchmark_bulk.rb b/benchmark/bulk.rb similarity index 100% rename from benchmark_bulk.rb rename to benchmark/bulk.rb diff --git a/benchmark/commands.rb b/benchmark/commands.rb new file mode 100644 index 0000000..dd759fd --- /dev/null +++ b/benchmark/commands.rb @@ -0,0 +1,91 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require "benchmark" +require "optparse" + +options = { + iterations: 3, + repo: nil +} + +OptionParser.new do |opts| + opts.banner = "Usage: bin/benchmark commands [options]" + + opts.on("-r", "--repo=PATH", "Path to repository to benchmark against") do |v| + options[:repo] = v + end + + opts.on("-n", "--iterations=N", Integer, "Number of iterations per command (default: 3)") do |v| + options[:iterations] = v + end + + opts.on("-h", "--help", "Show this help") do + puts opts + exit + end +end.parse! + +unless options[:repo] + puts "Error: --repo is required" + puts "Usage: bin/benchmark commands --repo /path/to/repo" + exit 1 +end + +repo_path = File.expand_path(options[:repo]) +unless File.directory?(repo_path) + puts "Error: #{repo_path} is not a directory" + exit 1 +end + +unless File.exist?(File.join(repo_path, ".git", "pkgs.sqlite3")) + puts "Error: #{repo_path} does not have a git-pkgs database" + puts "Run 'git pkgs init' in that repository first" + exit 1 +end + +iterations = options[:iterations] +gem_root = File.expand_path("../..", __FILE__) + +# Use bundle exec to ensure we run the local development version +commands = { + "blame" => "bundle exec --gemfile=#{gem_root}/Gemfile ruby -I#{gem_root}/lib #{gem_root}/exe/git-pkgs blame --no-pager", + "stale" => "bundle exec --gemfile=#{gem_root}/Gemfile ruby -I#{gem_root}/lib #{gem_root}/exe/git-pkgs stale --no-pager", + "stats" => "bundle exec --gemfile=#{gem_root}/Gemfile ruby -I#{gem_root}/lib #{gem_root}/exe/git-pkgs stats --no-pager", + "log" => "bundle exec 
--gemfile=#{gem_root}/Gemfile ruby -I#{gem_root}/lib #{gem_root}/exe/git-pkgs log --no-pager", + "list" => "bundle exec --gemfile=#{gem_root}/Gemfile ruby -I#{gem_root}/lib #{gem_root}/exe/git-pkgs list --no-pager" +} + +puts "Command benchmarks" +puts "=" * 60 +puts "Repository: #{repo_path}" +puts "Iterations: #{iterations}" +puts + +results = {} + +Dir.chdir(repo_path) do + commands.each do |name, cmd| + times = [] + + # Warmup run + system(cmd, out: File::NULL, err: File::NULL) + + iterations.times do + time = Benchmark.realtime do + system(cmd, out: File::NULL, err: File::NULL) + end + times << time + end + + avg = times.sum / times.size + min = times.min + max = times.max + results[name] = { avg: avg, min: min, max: max } + + puts format("%-10s avg: %6.3fs min: %6.3fs max: %6.3fs", name, avg, min, max) + end +end + +puts +puts "Total average: #{format("%.3fs", results.values.sum { |r| r[:avg] })}" diff --git a/benchmark_db.rb b/benchmark/db.rb similarity index 100% rename from benchmark_db.rb rename to benchmark/db.rb diff --git a/benchmark_detailed.rb b/benchmark/detailed.rb similarity index 100% rename from benchmark_detailed.rb rename to benchmark/detailed.rb diff --git a/benchmark_full.rb b/benchmark/full.rb similarity index 100% rename from benchmark_full.rb rename to benchmark/full.rb diff --git a/bin/benchmark b/bin/benchmark new file mode 100755 index 0000000..1ab7a1e --- /dev/null +++ b/bin/benchmark @@ -0,0 +1,30 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +BENCHMARKS = { + "full" => "Full pipeline benchmark", + "detailed" => "Detailed phase breakdown", + "bulk" => "Bulk insert benchmark", + "db" => "DB operation breakdown", + "commands" => "CLI command benchmarks" +}.freeze + +def usage + puts "Usage: bin/benchmark [repo_path] [sample_size]" + puts " bin/benchmark commands --repo /path/to/repo [-n iterations]" + puts + puts "Types:" + BENCHMARKS.each do |name, desc| + puts " #{name.ljust(10)} #{desc}" + end + puts + puts "Example: 
bin/benchmark full /path/to/repo 500" + exit 1 +end + +type = ARGV.shift +usage if type.nil? || type == "-h" || type == "--help" +usage unless BENCHMARKS.key?(type) + +script = File.expand_path("../benchmark/#{type}.rb", __dir__) +exec("ruby", script, *ARGV) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..6f302d5 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,9 @@ +# Documentation + +Technical documentation for git-pkgs maintainers and contributors. + +- [internals.md](internals.md) - Architecture overview, how commands work, key algorithms +- [schema.md](schema.md) - Database tables and relationships +- [benchmarking.md](benchmarking.md) - Performance profiling tools + +For user-facing documentation, see the main [README](../README.md). diff --git a/docs/benchmarking.md b/docs/benchmarking.md new file mode 100644 index 0000000..892ecd4 --- /dev/null +++ b/docs/benchmarking.md @@ -0,0 +1,94 @@ +# Benchmarking + +git-pkgs includes benchmark scripts for profiling performance. Run them with: + +```bash +bin/benchmark <type> [repo_path] [sample_size] +``` + +The default repo is `/Users/andrew/code/octobox` and sample size is 500 commits. + +## Benchmark Types + +### full + +Full pipeline benchmark with phase breakdown: + +```bash +bin/benchmark full /path/to/repo 500 +``` + +Measures time spent in each phase: git diff extraction, manifest filtering, parsing, and database writes. Reports overall throughput in commits/sec. + +### detailed + +Granular breakdown of each processing step: + +```bash +bin/benchmark detailed /path/to/repo 500 +``` + +Shows timing for blob path extraction, regex pre-filtering, bibliothecary identification, and manifest parsing. Also breaks down parsing time by platform (rubygems, npm, etc.) and reports how many commits pass each filter stage. 
+ +### bulk + +Compares data collection vs bulk insert performance: + +```bash +bin/benchmark bulk /path/to/repo 500 +``` + +Separates the time spent analyzing commits from the time spent writing to the database. Uses `insert_all` for bulk operations. Helps identify whether bottlenecks are in git/parsing or database writes. + +### db + +Individual database operation timing: + +```bash +bin/benchmark db /path/to/repo 200 +``` + +Measures each ActiveRecord operation separately: commit creation, branch_commit creation, manifest lookups, change inserts, and snapshot inserts. Shows per-operation averages in milliseconds. + +### commands + +End-to-end CLI command benchmarks: + +```bash +bin/benchmark commands --repo /path/to/repo -n 3 +``` + +Runs actual git-pkgs commands (`blame`, `stale`, `stats`, `log`, `list`) against a repo with an existing database. Measures wall-clock time over multiple iterations. Useful for regression testing command performance. + +The repo must already have a database from `git pkgs init`. + +## Interpreting Results + +The main bottlenecks are typically: + +1. **Git blob reads** - extracting file contents from commits +2. **Bibliothecary parsing** - parsing manifest file contents +3. **Database writes** - inserting records (mitigated by bulk inserts) + +The regex pre-filter (`might_have_manifests?`) skips most commits cheaply. On a typical codebase, only 10-20% of commits touch files that could be manifests. + +Blob OID caching helps when the same manifest content appears across multiple commits. The cache stats show hit rates. 
+ +## Example Output + +``` +Full pipeline benchmark: 500 commits +============================================================ + +Full pipeline breakdown: +------------------------------------------------------------ + git_diff 0.892s (12.3%) + filtering 0.234s (3.2%) + parsing 4.521s (62.4%) + db_writes 1.602s (22.1%) +------------------------------------------------------------ + Total 7.249s + +Throughput: 69.0 commits/sec +Cache stats: {:cached_blobs=>142, :blobs_with_hits=>89} +``` diff --git a/git-pkgs.gemspec b/git-pkgs.gemspec index 87c2272..a51e697 100644 --- a/git-pkgs.gemspec +++ b/git-pkgs.gemspec @@ -23,7 +23,8 @@ Gem::Specification.new do |spec| spec.files = IO.popen(%w[git ls-files -z], chdir: __dir__, err: IO::NULL) do |ls| ls.readlines("\x0", chomp: true).reject do |f| (f == gemspec) || - f.start_with?(*%w[bin/ Gemfile .gitignore test/ .github/]) + f.start_with?(*%w[bin/ Gemfile .gitignore test/ .github/ docs/ benchmark/]) || + f.end_with?(*%w[Rakefile CODE_OF_CONDUCT.md CONTRIBUTING.md SECURITY.md]) end end spec.bindir = "exe"