From 79a7e7f63f598acade7c975ee397edfa53823c42 Mon Sep 17 00:00:00 2001
From: Steve Gerbino <steve@gerbino.co>
Date: Wed, 4 Feb 2026 14:27:14 +0100
Subject: [PATCH 1/2] Consolidate benchmarks into single executables

Combine the four separate benchmark executables of each library into a single executable:
- asio_bench: unified Asio benchmarks with --category and --bench filters
- corosio_bench: unified Corosio benchmarks with --backend, --category, and --bench filters

Extract make_socket_pair into shared socket_utils.hpp to reduce duplication.
---
 bench/asio/CMakeLists.txt                 |  43 +--
 bench/asio/benchmarks.hpp                 |  59 ++++
 bench/asio/http_server_bench.cpp          | 391 ++++++++-------------
 bench/asio/io_context_bench.cpp           | 270 +++++---------
 bench/asio/main.cpp                       | 138 ++++++++
 bench/asio/socket_latency_bench.cpp       | 256 ++++----------
 bench/asio/socket_throughput_bench.cpp    | 294 +++++-----------
 bench/asio/socket_utils.hpp               |  44 +++
 bench/corosio/CMakeLists.txt              |  31 +-
 bench/corosio/benchmarks.hpp              |  63 ++++
 bench/corosio/http_server_bench.cpp       | 410 ++++++++--------------
 bench/corosio/io_context_bench.cpp        | 334 ++++++------------
 bench/corosio/main.cpp                    | 175 +++++++++
 bench/corosio/socket_latency_bench.cpp    | 286 +++++----------
 bench/corosio/socket_throughput_bench.cpp | 321 ++++++-----------
 15 files changed, 1385 insertions(+), 1730 deletions(-)
 create mode 100644 bench/asio/benchmarks.hpp
 create mode 100644 bench/asio/main.cpp
 create mode 100644 bench/asio/socket_utils.hpp
 create mode 100644 bench/corosio/benchmarks.hpp
 create mode 100644 bench/corosio/main.cpp

diff --git a/bench/asio/CMakeLists.txt b/bench/asio/CMakeLists.txt
index f68d705d..fd563510 100644
--- a/bench/asio/CMakeLists.txt
+++ b/bench/asio/CMakeLists.txt
@@ -8,25 +8,28 @@
 # Official repository: https://github.com/cppalliance/corosio
 #
 
-# Asio benchmark executables for comparison
+# Asio benchmark executable for comparison
 
-function(asio_add_benchmark name source)
-    add_executable(${name} ${source})
-    target_link_libraries(${name}
-        PRIVATE
-            Boost::asio
-            Threads::Threads)
-    target_compile_features(${name} PUBLIC cxx_std_20)
-    target_compile_options(${name}
-        PRIVATE
-            $<$<CXX_COMPILER_ID:GNU>:-fcoroutines>)
-    set_property(TARGET ${name} PROPERTY FOLDER "benchmarks/asio")
-    if (COROSIO_BENCH_LTO_SUPPORTED)
-        set_property(TARGET ${name} PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
-    endif ()
-endfunction()
+add_executable(asio_bench
+    main.cpp
+    io_context_bench.cpp
+    socket_throughput_bench.cpp
+    socket_latency_bench.cpp
+    http_server_bench.cpp)
 
-asio_add_benchmark(asio_bench_io_context io_context_bench.cpp)
-asio_add_benchmark(asio_bench_socket_throughput socket_throughput_bench.cpp)
-asio_add_benchmark(asio_bench_socket_latency socket_latency_bench.cpp)
-asio_add_benchmark(asio_bench_http_server http_server_bench.cpp)
+target_link_libraries(asio_bench
+    PRIVATE
+        Boost::asio
+        Threads::Threads)
+
+target_compile_features(asio_bench PUBLIC cxx_std_20)
+
+target_compile_options(asio_bench
+    PRIVATE
+        $<$<CXX_COMPILER_ID:GNU>:-fcoroutines>)
+
+set_property(TARGET asio_bench PROPERTY FOLDER "benchmarks/asio")
+
+if (COROSIO_BENCH_LTO_SUPPORTED)
+    set_property(TARGET asio_bench PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
+endif ()
diff --git a/bench/asio/benchmarks.hpp b/bench/asio/benchmarks.hpp
new file mode 100644
index 00000000..17557f50
--- /dev/null
+++ b/bench/asio/benchmarks.hpp
@@ -0,0 +1,59 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/corosio
+//
+
+#ifndef ASIO_BENCH_BENCHMARKS_HPP
+#define ASIO_BENCH_BENCHMARKS_HPP
+
+#include "../common/benchmark.hpp"
+
+namespace asio_bench {
+
+/** Run io_context benchmarks.
+
+    @param collector Results collector.
+    @param filter Optional filter: nullptr or "all" runs every benchmark, otherwise
+           only the named one (single_threaded, multithreaded, interleaved, concurrent).
+*/
+void run_io_context_benchmarks(
+    bench::result_collector& collector,
+    char const* filter );
+
+/** Run socket throughput benchmarks.
+
+    @param collector Results collector.
+    @param filter Optional filter: nullptr or "all" runs every benchmark, otherwise
+           only the named one (unidirectional, bidirectional).
+*/
+void run_socket_throughput_benchmarks(
+    bench::result_collector& collector,
+    char const* filter );
+
+/** Run socket latency benchmarks.
+
+    @param collector Results collector.
+    @param filter Optional filter: nullptr or "all" runs every benchmark, otherwise
+           only the named one (pingpong, concurrent).
+*/
+void run_socket_latency_benchmarks(
+    bench::result_collector& collector,
+    char const* filter );
+
+/** Run HTTP server benchmarks.
+
+    @param collector Results collector.
+    @param filter Optional filter: nullptr or "all" runs every benchmark, otherwise
+           only the named one (single_conn, concurrent, multithread).
+*/
+void run_http_server_benchmarks(
+    bench::result_collector& collector,
+    char const* filter );
+
+} // namespace asio_bench
+
+#endif
diff --git a/bench/asio/http_server_bench.cpp b/bench/asio/http_server_bench.cpp
index cfb5b5c4..17da83f1 100644
--- a/bench/asio/http_server_bench.cpp
+++ b/bench/asio/http_server_bench.cpp
@@ -7,8 +7,9 @@
 // Official repository: https://github.com/cppalliance/corosio
 //
 
-#include <boost/asio/io_context.hpp>
-#include <boost/asio/ip/tcp.hpp>
+#include "benchmarks.hpp"
+#include "socket_utils.hpp"
+
 #include <boost/asio/co_spawn.hpp>
 #include <boost/asio/detached.hpp>
 #include <boost/asio/awaitable.hpp>
@@ -28,167 +29,138 @@
 #include "../common/benchmark.hpp"
 #include "../common/http_protocol.hpp"
 
-namespace asio = boost::asio;
-using tcp = asio::ip::tcp;
-
-// Create a connected socket pair using TCP loopback
-std::pair<tcp::socket, tcp::socket> make_socket_pair(asio::io_context& ioc)
-{
-    tcp::acceptor acceptor(ioc, tcp::endpoint(tcp::v4(), 0));
-    acceptor.set_option(tcp::acceptor::reuse_address(true));
-
-    tcp::socket client(ioc);
-    tcp::socket server(ioc);
-
-    auto endpoint = acceptor.local_endpoint();
-    client.connect(tcp::endpoint(asio::ip::address_v4::loopback(), endpoint.port()));
-    server = acceptor.accept();
+namespace asio_bench {
+namespace {
 
-    client.set_option(tcp::no_delay(true));
-    server.set_option(tcp::no_delay(true));
-
-    return {std::move(client), std::move(server)};
-}
-
-// Server coroutine: reads requests and sends responses
 asio::awaitable<void> server_task(
     tcp::socket& sock,
     int num_requests,
-    int& completed_requests)
+    int& completed_requests )
 {
     std::string buf;
 
     try
     {
-        while (completed_requests < num_requests)
+        while( completed_requests < num_requests )
         {
-            // Read until end of HTTP headers
             std::size_t n = co_await asio::async_read_until(
                 sock,
-                asio::dynamic_buffer(buf),
+                asio::dynamic_buffer( buf ),
                 "\r\n\r\n",
-                asio::use_awaitable);
+                asio::use_awaitable );
 
-            // Send response
             co_await asio::async_write(
                 sock,
-                asio::buffer(bench::http::small_response, bench::http::small_response_size),
-                asio::use_awaitable);
+                asio::buffer( bench::http::small_response, bench::http::small_response_size ),
+                asio::use_awaitable );
 
             ++completed_requests;
-            buf.erase(0, n);
+            buf.erase( 0, n );
         }
     }
-    catch (std::exception const&) {}
+    catch( std::exception const& ) {}
 }
 
-// Client coroutine: sends requests and reads responses
 asio::awaitable<void> client_task(
     tcp::socket& sock,
     int num_requests,
-    bench::statistics& latency_stats)
+    bench::statistics& latency_stats )
 {
     std::string buf;
 
     try
     {
-        for (int i = 0; i < num_requests; ++i)
+        for( int i = 0; i < num_requests; ++i )
         {
             bench::stopwatch sw;
 
-            // Send request
             co_await asio::async_write(
                 sock,
-                asio::buffer(bench::http::small_request, bench::http::small_request_size),
-                asio::use_awaitable);
+                asio::buffer( bench::http::small_request, bench::http::small_request_size ),
+                asio::use_awaitable );
 
-            // Read response headers
             std::size_t header_end = co_await asio::async_read_until(
                 sock,
-                asio::dynamic_buffer(buf),
+                asio::dynamic_buffer( buf ),
                 "\r\n\r\n",
-                asio::use_awaitable);
+                asio::use_awaitable );
 
-            // Parse Content-Length from headers and read body if needed
-            std::string_view headers(buf.data(), header_end);
+            std::string_view headers( buf.data(), header_end );
             std::size_t content_length = 0;
-            auto pos = headers.find("Content-Length: ");
-            if (pos != std::string_view::npos)
+            auto pos = headers.find( "Content-Length: " );
+            if( pos != std::string_view::npos )
             {
                 pos += 16;
-                while (pos < headers.size() && headers[pos] >= '0' && headers[pos] <= '9')
+                while( pos < headers.size() && headers[pos] >= '0' && headers[pos] <= '9' )
                 {
-                    content_length = content_length * 10 + (headers[pos] - '0');
+                    content_length = content_length * 10 + ( headers[pos] - '0' );
                     ++pos;
                 }
             }
 
-            // Read body if not already in buffer
             std::size_t total_size = header_end + content_length;
-            if (buf.size() < total_size)
+            if( buf.size() < total_size )
             {
                 std::size_t need = total_size - buf.size();
                 std::size_t old_size = buf.size();
-                buf.resize(total_size);
+                buf.resize( total_size );
                 co_await asio::async_read(
                     sock,
-                    asio::buffer(buf.data() + old_size, need),
-                    asio::use_awaitable);
+                    asio::buffer( buf.data() + old_size, need ),
+                    asio::use_awaitable );
             }
 
             double latency_us = sw.elapsed_us();
-            latency_stats.add(latency_us);
+            latency_stats.add( latency_us );
 
-            buf.erase(0, total_size);
+            buf.erase( 0, total_size );
         }
     }
-    catch (std::exception const&) {}
+    catch( std::exception const& ) {}
 }
 
-// Single connection benchmark
-bench::benchmark_result bench_single_connection(int num_requests)
+bench::benchmark_result bench_single_connection( int num_requests )
 {
     std::cout << "  Requests: " << num_requests << "\n";
 
     asio::io_context ioc;
-    auto [client, server] = make_socket_pair(ioc);
+    auto [client, server] = make_socket_pair( ioc );
 
     int completed_requests = 0;
     bench::statistics latency_stats;
 
     bench::stopwatch total_sw;
 
-    asio::co_spawn(ioc,
-        server_task(server, num_requests, completed_requests),
-        asio::detached);
-    asio::co_spawn(ioc,
-        client_task(client, num_requests, latency_stats),
-        asio::detached);
+    asio::co_spawn( ioc,
+        server_task( server, num_requests, completed_requests ),
+        asio::detached );
+    asio::co_spawn( ioc,
+        client_task( client, num_requests, latency_stats ),
+        asio::detached );
 
     ioc.run();
 
     double elapsed = total_sw.elapsed_seconds();
-    double requests_per_sec = static_cast<double>(num_requests) / elapsed;
+    double requests_per_sec = static_cast<double>( num_requests ) / elapsed;
 
     std::cout << "    Completed: " << num_requests << " requests\n";
-    std::cout << "    Elapsed: " << std::fixed << std::setprecision(3)
+    std::cout << "    Elapsed: " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "    Throughput: " << bench::format_rate(requests_per_sec) << "\n";
-    bench::print_latency_stats(latency_stats, "Request latency");
+    std::cout << "    Throughput: " << bench::format_rate( requests_per_sec ) << "\n";
+    bench::print_latency_stats( latency_stats, "Request latency" );
     std::cout << "\n";
 
     client.close();
     server.close();
 
-    return bench::benchmark_result("single_conn")
-        .add("num_requests", num_requests)
-        .add("num_connections", 1)
-        .add("requests_per_sec", requests_per_sec)
-        .add_latency_stats("request_latency", latency_stats);
+    return bench::benchmark_result( "single_conn" )
+        .add( "num_requests", num_requests )
+        .add( "num_connections", 1 )
+        .add( "requests_per_sec", requests_per_sec )
+        .add_latency_stats( "request_latency", latency_stats );
 }
 
-// Concurrent connections benchmark
-bench::benchmark_result bench_concurrent_connections(int num_connections, int requests_per_conn)
+bench::benchmark_result bench_concurrent_connections( int num_connections, int requests_per_conn )
 {
     int total_requests = num_connections * requests_per_conn;
     std::cout << "  Connections: " << num_connections
@@ -199,70 +171,68 @@ bench::benchmark_result bench_concurrent_connections(int num_connections, int re
 
     std::vector<tcp::socket> clients;
     std::vector<tcp::socket> servers;
-    std::vector<int> completed(num_connections, 0);
-    std::vector<bench::statistics> stats(num_connections);
+    std::vector<int> completed( num_connections, 0 );
+    std::vector<bench::statistics> stats( num_connections );
 
-    clients.reserve(num_connections);
-    servers.reserve(num_connections);
+    clients.reserve( num_connections );
+    servers.reserve( num_connections );
 
-    for (int i = 0; i < num_connections; ++i)
+    for( int i = 0; i < num_connections; ++i )
     {
-        auto [c, s] = make_socket_pair(ioc);
-        clients.push_back(std::move(c));
-        servers.push_back(std::move(s));
+        auto [c, s] = make_socket_pair( ioc );
+        clients.push_back( std::move( c ) );
+        servers.push_back( std::move( s ) );
     }
 
     bench::stopwatch total_sw;
 
-    for (int i = 0; i < num_connections; ++i)
+    for( int i = 0; i < num_connections; ++i )
     {
-        asio::co_spawn(ioc,
-            server_task(servers[i], requests_per_conn, completed[i]),
-            asio::detached);
-        asio::co_spawn(ioc,
-            client_task(clients[i], requests_per_conn, stats[i]),
-            asio::detached);
+        asio::co_spawn( ioc,
+            server_task( servers[i], requests_per_conn, completed[i] ),
+            asio::detached );
+        asio::co_spawn( ioc,
+            client_task( clients[i], requests_per_conn, stats[i] ),
+            asio::detached );
     }
 
     ioc.run();
 
     double elapsed = total_sw.elapsed_seconds();
-    double requests_per_sec = static_cast<double>(total_requests) / elapsed;
+    double requests_per_sec = static_cast<double>( total_requests ) / elapsed;
 
-    // Aggregate latency stats
     double total_mean = 0;
     double total_p99 = 0;
-    for (auto& s : stats)
+    for( auto& s : stats )
     {
         total_mean += s.mean();
         total_p99 += s.p99();
     }
 
     std::cout << "    Completed: " << total_requests << " requests\n";
-    std::cout << "    Elapsed: " << std::fixed << std::setprecision(3)
+    std::cout << "    Elapsed: " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "    Throughput: " << bench::format_rate(requests_per_sec) << "\n";
+    std::cout << "    Throughput: " << bench::format_rate( requests_per_sec ) << "\n";
     std::cout << "    Avg mean latency: "
-              << bench::format_latency(total_mean / num_connections) << "\n";
+              << bench::format_latency( total_mean / num_connections ) << "\n";
     std::cout << "    Avg p99 latency: "
-              << bench::format_latency(total_p99 / num_connections) << "\n\n";
+              << bench::format_latency( total_p99 / num_connections ) << "\n\n";
 
-    for (auto& c : clients)
+    for( auto& c : clients )
         c.close();
-    for (auto& s : servers)
+    for( auto& s : servers )
         s.close();
 
-    return bench::benchmark_result("concurrent_" + std::to_string(num_connections))
-        .add("num_connections", num_connections)
-        .add("requests_per_conn", requests_per_conn)
-        .add("total_requests", total_requests)
-        .add("requests_per_sec", requests_per_sec)
-        .add("avg_mean_latency_us", total_mean / num_connections)
-        .add("avg_p99_latency_us", total_p99 / num_connections);
+    return bench::benchmark_result( "concurrent_" + std::to_string( num_connections ) )
+        .add( "num_connections", num_connections )
+        .add( "requests_per_conn", requests_per_conn )
+        .add( "total_requests", total_requests )
+        .add( "requests_per_sec", requests_per_sec )
+        .add( "avg_mean_latency_us", total_mean / num_connections )
+        .add( "avg_p99_latency_us", total_p99 / num_connections );
 }
 
-// Multi-threaded benchmark: multiple threads calling run()
-bench::benchmark_result bench_multithread(int num_threads, int num_connections, int requests_per_conn)
+bench::benchmark_result bench_multithread( int num_threads, int num_connections, int requests_per_conn )
 {
     int total_requests = num_connections * requests_per_conn;
     std::cout << "  Threads: " << num_threads
@@ -270,192 +240,117 @@ bench::benchmark_result bench_multithread(int num_threads, int num_connections,
               << ", Requests per connection: " << requests_per_conn
               << ", Total: " << total_requests << "\n";
 
-    asio::io_context ioc(num_threads);
+    asio::io_context ioc( num_threads );
 
     std::vector<tcp::socket> clients;
     std::vector<tcp::socket> servers;
-    std::vector<int> completed(num_connections, 0);
-    std::vector<bench::statistics> stats(num_connections);
+    std::vector<int> completed( num_connections, 0 );
+    std::vector<bench::statistics> stats( num_connections );
 
-    clients.reserve(num_connections);
-    servers.reserve(num_connections);
+    clients.reserve( num_connections );
+    servers.reserve( num_connections );
 
-    for (int i = 0; i < num_connections; ++i)
+    for( int i = 0; i < num_connections; ++i )
     {
-        auto [c, s] = make_socket_pair(ioc);
-        clients.push_back(std::move(c));
-        servers.push_back(std::move(s));
+        auto [c, s] = make_socket_pair( ioc );
+        clients.push_back( std::move( c ) );
+        servers.push_back( std::move( s ) );
     }
 
-    // Spawn all coroutines before starting threads
-    for (int i = 0; i < num_connections; ++i)
+    for( int i = 0; i < num_connections; ++i )
     {
-        asio::co_spawn(ioc,
-            server_task(servers[i], requests_per_conn, completed[i]),
-            asio::detached);
-        asio::co_spawn(ioc,
-            client_task(clients[i], requests_per_conn, stats[i]),
-            asio::detached);
+        asio::co_spawn( ioc,
+            server_task( servers[i], requests_per_conn, completed[i] ),
+            asio::detached );
+        asio::co_spawn( ioc,
+            client_task( clients[i], requests_per_conn, stats[i] ),
+            asio::detached );
     }
 
     bench::stopwatch total_sw;
 
-    // Launch worker threads
     std::vector<std::thread> threads;
-    threads.reserve(num_threads - 1);
-    for (int i = 1; i < num_threads; ++i)
-        threads.emplace_back([&ioc] { ioc.run(); });
+    threads.reserve( num_threads - 1 );
+    for( int i = 1; i < num_threads; ++i )
+        threads.emplace_back( [&ioc] { ioc.run(); } );
 
-    // Main thread also runs
     ioc.run();
 
-    // Wait for all threads
-    for (auto& t : threads)
+    for( auto& t : threads )
         t.join();
 
     double elapsed = total_sw.elapsed_seconds();
-    double requests_per_sec = static_cast<double>(total_requests) / elapsed;
+    double requests_per_sec = static_cast<double>( total_requests ) / elapsed;
 
-    // Aggregate latency stats
     double total_mean = 0;
     double total_p99 = 0;
-    for (auto& s : stats)
+    for( auto& s : stats )
     {
         total_mean += s.mean();
         total_p99 += s.p99();
     }
 
     std::cout << "    Completed: " << total_requests << " requests\n";
-    std::cout << "    Elapsed: " << std::fixed << std::setprecision(3)
+    std::cout << "    Elapsed: " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "    Throughput: " << bench::format_rate(requests_per_sec) << "\n";
+    std::cout << "    Throughput: " << bench::format_rate( requests_per_sec ) << "\n";
     std::cout << "    Avg mean latency: "
-              << bench::format_latency(total_mean / num_connections) << "\n";
+              << bench::format_latency( total_mean / num_connections ) << "\n";
     std::cout << "    Avg p99 latency: "
-              << bench::format_latency(total_p99 / num_connections) << "\n\n";
+              << bench::format_latency( total_p99 / num_connections ) << "\n\n";
 
-    for (auto& c : clients)
+    for( auto& c : clients )
         c.close();
-    for (auto& s : servers)
+    for( auto& s : servers )
         s.close();
 
-    return bench::benchmark_result("multithread_" + std::to_string(num_threads) + "t")
-        .add("num_threads", num_threads)
-        .add("num_connections", num_connections)
-        .add("requests_per_conn", requests_per_conn)
-        .add("total_requests", total_requests)
-        .add("requests_per_sec", requests_per_sec)
-        .add("avg_mean_latency_us", total_mean / num_connections)
-        .add("avg_p99_latency_us", total_p99 / num_connections);
+    return bench::benchmark_result( "multithread_" + std::to_string( num_threads ) + "t" )
+        .add( "num_threads", num_threads )
+        .add( "num_connections", num_connections )
+        .add( "requests_per_conn", requests_per_conn )
+        .add( "total_requests", total_requests )
+        .add( "requests_per_sec", requests_per_sec )
+        .add( "avg_mean_latency_us", total_mean / num_connections )
+        .add( "avg_p99_latency_us", total_p99 / num_connections );
 }
 
-void run_benchmarks(char const* output_file, char const* bench_filter)
-{
-    std::cout << "Boost.Asio HTTP Server Benchmarks\n";
-    std::cout << "=================================\n";
-
-    bench::result_collector collector("asio");
+} // anonymous namespace
 
-    bool run_all = !bench_filter || std::strcmp(bench_filter, "all") == 0;
+void run_http_server_benchmarks(
+    bench::result_collector& collector,
+    char const* filter )
+{
+    std::cout << "\n>>> HTTP Server Benchmarks (Asio) <<<\n";
 
-    if (run_all || std::strcmp(bench_filter, "single_conn") == 0)
-    {
-        bench::print_header("Single Connection (Sequential Requests)");
-        collector.add(bench_single_connection(10000));
-    }
+    bool run_all = !filter || std::strcmp( filter, "all" ) == 0;
 
-    if (run_all || std::strcmp(bench_filter, "concurrent") == 0)
+    if( run_all || std::strcmp( filter, "single_conn" ) == 0 )
     {
-        if (run_all)
-            std::this_thread::sleep_for(std::chrono::seconds(5));
-        bench::print_header("Concurrent Connections");
-        collector.add(bench_concurrent_connections(1, 10000));
-        collector.add(bench_concurrent_connections(4, 2500));
-        collector.add(bench_concurrent_connections(16, 625));
-        collector.add(bench_concurrent_connections(32, 312));
+        bench::print_header( "Single Connection (Sequential Requests)" );
+        collector.add( bench_single_connection( 10000 ) );
     }
 
-    if (run_all || std::strcmp(bench_filter, "multithread") == 0)
+    if( run_all || std::strcmp( filter, "concurrent" ) == 0 )
     {
-        if (run_all)
-            std::this_thread::sleep_for(std::chrono::seconds(5));
-        bench::print_header("Multi-threaded (32 connections, varying threads)");
-        collector.add(bench_multithread(1, 32, 312));
-        collector.add(bench_multithread(2, 32, 312));
-        collector.add(bench_multithread(4, 32, 312));
-        collector.add(bench_multithread(8, 32, 312));
+        if( run_all )
+            std::this_thread::sleep_for( std::chrono::seconds( 5 ) );
+        bench::print_header( "Concurrent Connections" );
+        collector.add( bench_concurrent_connections( 1, 10000 ) );
+        collector.add( bench_concurrent_connections( 4, 2500 ) );
+        collector.add( bench_concurrent_connections( 16, 625 ) );
+        collector.add( bench_concurrent_connections( 32, 312 ) );
     }
 
-    std::cout << "\nBenchmarks complete.\n";
-
-    if (output_file)
+    if( run_all || std::strcmp( filter, "multithread" ) == 0 )
     {
-        if (collector.write_json(output_file))
-            std::cout << "Results written to: " << output_file << "\n";
-        else
-            std::cerr << "Error: Failed to write results to: " << output_file << "\n";
+        if( run_all )
+            std::this_thread::sleep_for( std::chrono::seconds( 5 ) );
+        bench::print_header( "Multi-threaded (32 connections, varying threads)" );
+        collector.add( bench_multithread( 1, 32, 312 ) );
+        collector.add( bench_multithread( 2, 32, 312 ) );
+        collector.add( bench_multithread( 4, 32, 312 ) );
+        collector.add( bench_multithread( 8, 32, 312 ) );
     }
 }
 
-void print_usage(char const* program_name)
-{
-    std::cout << "Usage: " << program_name << " [OPTIONS]\n\n";
-    std::cout << "Options:\n";
-    std::cout << "  --bench <name>     Run only the specified benchmark\n";
-    std::cout << "  --output <file>    Write JSON results to file\n";
-    std::cout << "  --help             Show this help message\n";
-    std::cout << "\n";
-    std::cout << "Available benchmarks:\n";
-    std::cout << "  single_conn        Single connection, sequential requests\n";
-    std::cout << "  concurrent         Multiple concurrent connections\n";
-    std::cout << "  multithread        Multi-threaded with varying thread counts\n";
-    std::cout << "  all                Run all benchmarks (default)\n";
-}
-
-int main(int argc, char* argv[])
-{
-    char const* output_file = nullptr;
-    char const* bench_filter = nullptr;
-
-    for (int i = 1; i < argc; ++i)
-    {
-        if (std::strcmp(argv[i], "--bench") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                bench_filter = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --bench requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--output") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                output_file = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --output requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--help") == 0 || std::strcmp(argv[i], "-h") == 0)
-        {
-            print_usage(argv[0]);
-            return 0;
-        }
-        else
-        {
-            std::cerr << "Unknown option: " << argv[i] << "\n";
-            print_usage(argv[0]);
-            return 1;
-        }
-    }
-
-    run_benchmarks(output_file, bench_filter);
-    return 0;
-}
+} // namespace asio_bench
diff --git a/bench/asio/io_context_bench.cpp b/bench/asio/io_context_bench.cpp
index 616645c6..987768bd 100644
--- a/bench/asio/io_context_bench.cpp
+++ b/bench/asio/io_context_bench.cpp
@@ -7,8 +7,7 @@
 // Official repository: https://github.com/cppalliance/corosio
 //
 
-// This benchmark uses coroutines (like Corosio) for a fair comparison,
-// rather than plain callbacks.
+#include "benchmarks.hpp"
 
 #include <boost/asio/io_context.hpp>
 #include <boost/asio/co_spawn.hpp>
@@ -26,108 +25,100 @@
 
 namespace asio = boost::asio;
 
-// Coroutine that increments a counter
-asio::awaitable<void> increment_task(int& counter)
+namespace asio_bench {
+namespace {
+
+asio::awaitable<void> increment_task( int& counter )
 {
     ++counter;
     co_return;
 }
 
-// Coroutine that increments an atomic counter
-asio::awaitable<void> atomic_increment_task(std::atomic<int>& counter)
+asio::awaitable<void> atomic_increment_task( std::atomic<int>& counter )
 {
-    counter.fetch_add(1, std::memory_order_relaxed);
+    counter.fetch_add( 1, std::memory_order_relaxed );
     co_return;
 }
 
-// Measures single-threaded coroutine throughput using Asio's awaitable/co_spawn.
-// This is a direct apples-to-apples comparison with Corosio since both use C++20
-// coroutines. Differences reveal the overhead of each framework's coroutine
-// integration rather than callback vs. coroutine differences.
-bench::benchmark_result bench_single_threaded_post(int num_handlers)
+bench::benchmark_result bench_single_threaded_post( int num_handlers )
 {
-    bench::print_header("Single-threaded Handler Post (Asio)");
+    bench::print_header( "Single-threaded Handler Post (Asio)" );
 
     asio::io_context ioc;
     int counter = 0;
 
     bench::stopwatch sw;
 
-    for (int i = 0; i < num_handlers; ++i)
-        asio::co_spawn(ioc, increment_task(counter), asio::detached);
+    for( int i = 0; i < num_handlers; ++i )
+        asio::co_spawn( ioc, increment_task( counter ), asio::detached );
 
     ioc.run();
 
     double elapsed = sw.elapsed_seconds();
-    double ops_per_sec = static_cast<double>(num_handlers) / elapsed;
+    double ops_per_sec = static_cast<double>( num_handlers ) / elapsed;
 
     std::cout << "  Handlers:    " << num_handlers << "\n";
-    std::cout << "  Elapsed:     " << std::fixed << std::setprecision(3)
+    std::cout << "  Elapsed:     " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "  Throughput:  " << bench::format_rate(ops_per_sec) << "\n";
+    std::cout << "  Throughput:  " << bench::format_rate( ops_per_sec ) << "\n";
 
-    if (counter != num_handlers)
+    if( counter != num_handlers )
     {
         std::cerr << "  ERROR: counter mismatch! Expected " << num_handlers
                   << ", got " << counter << "\n";
     }
 
-    return bench::benchmark_result("single_threaded_post")
-        .add("handlers", num_handlers)
-        .add("elapsed_s", elapsed)
-        .add("ops_per_sec", ops_per_sec);
+    return bench::benchmark_result( "single_threaded_post" )
+        .add( "handlers", num_handlers )
+        .add( "elapsed_s", elapsed )
+        .add( "ops_per_sec", ops_per_sec );
 }
 
-// Measures multi-threaded scaling using Asio coroutines. Tests how Asio's
-// scheduler handles coroutine resumption across threads. Compare against Corosio
-// to evaluate coroutine dispatch efficiency under thread contention.
-bench::benchmark_result bench_multithreaded_scaling(int num_handlers, int max_threads)
+bench::benchmark_result bench_multithreaded_scaling( int num_handlers, int max_threads )
 {
-    bench::print_header("Multi-threaded Scaling (Asio Coroutines)");
+    bench::print_header( "Multi-threaded Scaling (Asio Coroutines)" );
 
     std::cout << "  Handlers per test: " << num_handlers << "\n\n";
 
-    bench::benchmark_result result("multithreaded_scaling");
-    result.add("handlers", num_handlers);
+    bench::benchmark_result result( "multithreaded_scaling" );
+    result.add( "handlers", num_handlers );
 
     double baseline_ops = 0;
 
-    for (int num_threads = 1; num_threads <= max_threads; num_threads *= 2)
+    for( int num_threads = 1; num_threads <= max_threads; num_threads *= 2 )
     {
         asio::io_context ioc;
-        std::atomic<int> counter{0};
+        std::atomic<int> counter{ 0 };
 
-        // Post all coroutines first
-        for (int i = 0; i < num_handlers; ++i)
-            asio::co_spawn(ioc, atomic_increment_task(counter), asio::detached);
+        for( int i = 0; i < num_handlers; ++i )
+            asio::co_spawn( ioc, atomic_increment_task( counter ), asio::detached );
 
         bench::stopwatch sw;
 
-        // Run with multiple threads
         std::vector<std::thread> runners;
-        for (int t = 0; t < num_threads; ++t)
-            runners.emplace_back([&ioc]() { ioc.run(); });
+        for( int t = 0; t < num_threads; ++t )
+            runners.emplace_back( [&ioc]() { ioc.run(); } );
 
-        for (auto& t : runners)
+        for( auto& t : runners )
             t.join();
 
         double elapsed = sw.elapsed_seconds();
-        double ops_per_sec = static_cast<double>(num_handlers) / elapsed;
+        double ops_per_sec = static_cast<double>( num_handlers ) / elapsed;
 
         std::cout << "  " << num_threads << " thread(s): "
-                  << bench::format_rate(ops_per_sec);
+                  << bench::format_rate( ops_per_sec );
 
-        if (num_threads == 1)
+        if( num_threads == 1 )
             baseline_ops = ops_per_sec;
-        else if (baseline_ops > 0)
-            std::cout << " (speedup: " << std::fixed << std::setprecision(2)
-                      << (ops_per_sec / baseline_ops) << "x)";
+        else if( baseline_ops > 0 )
+            std::cout << " (speedup: " << std::fixed << std::setprecision( 2 )
+                      << ( ops_per_sec / baseline_ops ) << "x)";
 
         std::cout << "\n";
 
-        result.add("threads_" + std::to_string(num_threads) + "_ops_per_sec", ops_per_sec);
+        result.add( "threads_" + std::to_string( num_threads ) + "_ops_per_sec", ops_per_sec );
 
-        if (counter.load() != num_handlers)
+        if( counter.load() != num_handlers )
         {
             std::cerr << "  ERROR: counter mismatch! Expected " << num_handlers
                       << ", got " << counter.load() << "\n";
@@ -137,12 +128,9 @@ bench::benchmark_result bench_multithreaded_scaling(int num_handlers, int max_th
     return result;
 }
 
-// Measures poll() efficiency with Asio coroutines in a game-loop pattern.
-// Tests how Asio handles frequent context restarts with coroutine-based work.
-// Compare against Corosio for latency-sensitive polling scenarios.
-bench::benchmark_result bench_interleaved_post_run(int iterations, int handlers_per_iteration)
+bench::benchmark_result bench_interleaved_post_run( int iterations, int handlers_per_iteration ) // Throughput of repeated "spawn a batch, poll(), restart()" rounds on one io_context.
 {
-    bench::print_header("Interleaved Post/Run (Asio Coroutines)");
+    bench::print_header( "Interleaved Post/Run (Asio Coroutines)" );
 
     asio::io_context ioc;
     int counter = 0;
@@ -150,197 +138,119 @@ bench::benchmark_result bench_interleaved_post_run(int iterations, int handlers_
 
     bench::stopwatch sw;
 
-    for (int iter = 0; iter < iterations; ++iter)
+    for( int iter = 0; iter < iterations; ++iter ) // one round: queue a batch, poll() it, restart() so the next round can run
     {
-        for (int i = 0; i < handlers_per_iteration; ++i)
-            asio::co_spawn(ioc, increment_task(counter), asio::detached);
+        for( int i = 0; i < handlers_per_iteration; ++i )
+            asio::co_spawn( ioc, increment_task( counter ), asio::detached ); // fire-and-forget; the counter check below is the only completion signal
 
         ioc.poll();
         ioc.restart();
     }
 
-    // Run any remaining handlers
     ioc.run();
 
     double elapsed = sw.elapsed_seconds();
-    double ops_per_sec = static_cast<double>(total_handlers) / elapsed;
+    double ops_per_sec = static_cast<double>( total_handlers ) / elapsed; // handlers per elapsed second, all rounds plus the final run()
 
     std::cout << "  Iterations:        " << iterations << "\n";
     std::cout << "  Handlers/iter:     " << handlers_per_iteration << "\n";
     std::cout << "  Total handlers:    " << total_handlers << "\n";
-    std::cout << "  Elapsed:           " << std::fixed << std::setprecision(3)
+    std::cout << "  Elapsed:           " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "  Throughput:        " << bench::format_rate(ops_per_sec) << "\n";
+    std::cout << "  Throughput:        " << bench::format_rate( ops_per_sec ) << "\n";
 
-    if (counter != total_handlers)
+    if( counter != total_handlers ) // sanity check; presumably each increment_task bumps counter once - confirm against its definition
     {
         std::cerr << "  ERROR: counter mismatch! Expected " << total_handlers
                   << ", got " << counter << "\n";
     }
 
-    return bench::benchmark_result("interleaved_post_run")
-        .add("iterations", iterations)
-        .add("handlers_per_iteration", handlers_per_iteration)
-        .add("total_handlers", total_handlers)
-        .add("elapsed_s", elapsed)
-        .add("ops_per_sec", ops_per_sec);
+    return bench::benchmark_result( "interleaved_post_run" ) // result is returned even when the mismatch above was reported
+        .add( "iterations", iterations )
+        .add( "handlers_per_iteration", handlers_per_iteration )
+        .add( "total_handlers", total_handlers )
+        .add( "elapsed_s", elapsed )
+        .add( "ops_per_sec", ops_per_sec );
 }
 
-// Measures Asio coroutine performance under concurrent producer-consumer load.
-// Multiple threads spawn and execute coroutines simultaneously. Compare against
-// Corosio to evaluate coroutine dispatch under realistic server workloads.
-bench::benchmark_result bench_concurrent_post_run(int num_threads, int handlers_per_thread)
+bench::benchmark_result bench_concurrent_post_run( int num_threads, int handlers_per_thread ) // Throughput when every thread both spawns coroutines on, and runs, one shared io_context.
 {
-    bench::print_header("Concurrent Post and Run (Asio Coroutines)");
+    bench::print_header( "Concurrent Post and Run (Asio Coroutines)" );
 
     asio::io_context ioc;
-    std::atomic<int> counter{0};
+    std::atomic<int> counter{ 0 }; // atomic because every worker thread passes it to the tasks it spawns
     int total_handlers = num_threads * handlers_per_thread;
 
     bench::stopwatch sw;
 
-    // Launch threads that both post and run
     std::vector<std::thread> workers;
-    for (int t = 0; t < num_threads; ++t)
+    for( int t = 0; t < num_threads; ++t )
     {
-        workers.emplace_back([&ioc, &counter, handlers_per_thread]()
+        workers.emplace_back( [&ioc, &counter, handlers_per_thread]() // each worker first spawns its share, then calls run()
         {
-            for (int i = 0; i < handlers_per_thread; ++i)
-                asio::co_spawn(ioc, atomic_increment_task(counter), asio::detached);
+            for( int i = 0; i < handlers_per_thread; ++i )
+                asio::co_spawn( ioc, atomic_increment_task( counter ), asio::detached );
             ioc.run();
-        });
+        } ); // NOTE(review): if one worker's run() drains the queue and returns while another is still spawning, the io_context may be left stopped and later run() calls return at once - confirm; the mismatch check below would report it
     }
 
-    for (auto& t : workers)
+    for( auto& t : workers )
         t.join();
 
     double elapsed = sw.elapsed_seconds();
-    double ops_per_sec = static_cast<double>(total_handlers) / elapsed;
+    double ops_per_sec = static_cast<double>( total_handlers ) / elapsed; // elapsed is read after join(), so it spans thread creation through the last join
 
     std::cout << "  Threads:           " << num_threads << "\n";
     std::cout << "  Handlers/thread:   " << handlers_per_thread << "\n";
     std::cout << "  Total handlers:    " << total_handlers << "\n";
-    std::cout << "  Elapsed:           " << std::fixed << std::setprecision(3)
+    std::cout << "  Elapsed:           " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "  Throughput:        " << bench::format_rate(ops_per_sec) << "\n";
+    std::cout << "  Throughput:        " << bench::format_rate( ops_per_sec ) << "\n";
 
-    if (counter.load() != total_handlers)
+    if( counter.load() != total_handlers ) // sanity check; a shortfall means some spawned tasks never ran
     {
         std::cerr << "  ERROR: counter mismatch! Expected " << total_handlers
                   << ", got " << counter.load() << "\n";
     }
 
-    return bench::benchmark_result("concurrent_post_run")
-        .add("threads", num_threads)
-        .add("handlers_per_thread", handlers_per_thread)
-        .add("total_handlers", total_handlers)
-        .add("elapsed_s", elapsed)
-        .add("ops_per_sec", ops_per_sec);
+    return bench::benchmark_result( "concurrent_post_run" ) // result is returned even when the mismatch above was reported
+        .add( "threads", num_threads )
+        .add( "handlers_per_thread", handlers_per_thread )
+        .add( "total_handlers", total_handlers )
+        .add( "elapsed_s", elapsed )
+        .add( "ops_per_sec", ops_per_sec );
 }
 
-// Run benchmarks
-void run_benchmarks(const char* output_file, const char* bench_filter)
-{
-    std::cout << "Boost.Asio io_context Benchmarks\n";
-    std::cout << "=================================\n\n";
+} // anonymous namespace
 
-    bench::result_collector collector("asio");
+void run_io_context_benchmarks( // Runs the io_context benchmarks selected by filter and adds each result to collector.
+    bench::result_collector& collector, // owned by the caller; this function only appends to it
+    char const* filter ) // benchmark name; null or "all" selects every benchmark below
+{
+    std::cout << "\n>>> io_context Benchmarks (Asio) <<<\n";
 
-    bool run_all = !bench_filter || std::strcmp(bench_filter, "all") == 0;
+    bool run_all = !filter || std::strcmp( filter, "all" ) == 0; // checked first below, so filter is never passed to strcmp when null
 
     // Warm up
     {
         asio::io_context ioc;
         int counter = 0;
-        for (int i = 0; i < 1000; ++i)
-            asio::co_spawn(ioc, increment_task(counter), asio::detached);
+        for( int i = 0; i < 1000; ++i ) // untimed; runs regardless of filter, and counter is never read afterwards
+            asio::co_spawn( ioc, increment_task( counter ), asio::detached );
         ioc.run();
     }
 
-    // Run selected benchmarks
-    if (run_all || std::strcmp(bench_filter, "single_threaded") == 0)
-        collector.add(bench_single_threaded_post(1000000));
-
-    if (run_all || std::strcmp(bench_filter, "multithreaded") == 0)
-        collector.add(bench_multithreaded_scaling(1000000, 8));
-
-    if (run_all || std::strcmp(bench_filter, "interleaved") == 0)
-        collector.add(bench_interleaved_post_run(10000, 100));
-
-    if (run_all || std::strcmp(bench_filter, "concurrent") == 0)
-        collector.add(bench_concurrent_post_run(4, 250000));
+    if( run_all || std::strcmp( filter, "single_threaded" ) == 0 ) // NOTE(review): a filter matching none of the four names runs only the warm-up, silently
+        collector.add( bench_single_threaded_post( 1000000 ) );
 
-    std::cout << "\nBenchmarks complete.\n";
+    if( run_all || std::strcmp( filter, "multithreaded" ) == 0 )
+        collector.add( bench_multithreaded_scaling( 1000000, 8 ) ); // num_handlers, then presumably the maximum thread count - confirm
 
-    if (output_file)
-    {
-        if (collector.write_json(output_file))
-            std::cout << "Results written to: " << output_file << "\n";
-        else
-            std::cerr << "Error: Failed to write results to: " << output_file << "\n";
-    }
-}
+    if( run_all || std::strcmp( filter, "interleaved" ) == 0 )
+        collector.add( bench_interleaved_post_run( 10000, 100 ) ); // iterations, handlers_per_iteration
 
-void print_usage(const char* program_name)
-{
-    std::cout << "Usage: " << program_name << " [OPTIONS]\n\n";
-    std::cout << "Options:\n";
-    std::cout << "  --bench <name>     Run only the specified benchmark\n";
-    std::cout << "  --output <file>    Write JSON results to file\n";
-    std::cout << "  --help             Show this help message\n";
-    std::cout << "\n";
-    std::cout << "Available benchmarks:\n";
-    std::cout << "  single_threaded    Single-threaded coroutine post throughput\n";
-    std::cout << "  multithreaded      Multi-threaded scaling test\n";
-    std::cout << "  interleaved        Interleaved post/poll pattern\n";
-    std::cout << "  concurrent         Concurrent post and run\n";
-    std::cout << "  all                Run all benchmarks (default)\n";
+    if( run_all || std::strcmp( filter, "concurrent" ) == 0 )
+        collector.add( bench_concurrent_post_run( 4, 250000 ) ); // num_threads, handlers_per_thread
 }
 
-int main(int argc, char* argv[])
-{
-    const char* output_file = nullptr;
-    const char* bench_filter = nullptr;
-
-    for (int i = 1; i < argc; ++i)
-    {
-        if (std::strcmp(argv[i], "--bench") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                bench_filter = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --bench requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--output") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                output_file = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --output requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--help") == 0 || std::strcmp(argv[i], "-h") == 0)
-        {
-            print_usage(argv[0]);
-            return 0;
-        }
-        else
-        {
-            std::cerr << "Unknown option: " << argv[i] << "\n";
-            print_usage(argv[0]);
-            return 1;
-        }
-    }
-
-    run_benchmarks(output_file, bench_filter);
-    return 0;
-}
+} // namespace asio_bench
diff --git a/bench/asio/main.cpp b/bench/asio/main.cpp
new file mode 100644
index 00000000..c124cb21
--- /dev/null
+++ b/bench/asio/main.cpp
@@ -0,0 +1,138 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/corosio
+//
+
+#include "benchmarks.hpp"
+
+#include <cstring>
+#include <iostream>
+
+#include "../common/benchmark.hpp"
+
+namespace {
+
+void run_benchmarks( // Runs the selected benchmark categories and optionally writes the collected results as JSON.
+    char const* output_file, // JSON output path; null means no file is written
+    char const* category_filter, // category name; null or "all" runs every category
+    char const* bench_filter ) // forwarded unchanged to each category runner that is invoked
+{
+    std::cout << "Boost.Asio Benchmarks\n";
+    std::cout << "=====================\n";
+
+    bench::result_collector collector( "asio" ); // one collector for all categories, written once at the end
+
+    bool run_all = !category_filter || std::strcmp( category_filter, "all" ) == 0; // checked first below, so a null filter never reaches strcmp
+
+    if( run_all || std::strcmp( category_filter, "io_context" ) == 0 ) // NOTE(review): a category matching none of these four names runs nothing and is not reported
+        asio_bench::run_io_context_benchmarks( collector, bench_filter );
+
+    if( run_all || std::strcmp( category_filter, "socket_throughput" ) == 0 )
+        asio_bench::run_socket_throughput_benchmarks( collector, bench_filter );
+
+    if( run_all || std::strcmp( category_filter, "socket_latency" ) == 0 )
+        asio_bench::run_socket_latency_benchmarks( collector, bench_filter );
+
+    if( run_all || std::strcmp( category_filter, "http_server" ) == 0 )
+        asio_bench::run_http_server_benchmarks( collector, bench_filter );
+
+    std::cout << "\nBenchmarks complete.\n";
+
+    if( output_file )
+    {
+        if( collector.write_json( output_file ) )
+            std::cout << "Results written to: " << output_file << "\n";
+        else
+            std::cerr << "Error: Failed to write results to: " << output_file << "\n"; // reported on stderr only; this function returns void, so the caller cannot see the failure
+    }
+}
+
+void print_usage( char const* program_name ) // Prints the command-line help text to stdout; program_name is echoed in the usage line.
+{
+    std::cout << "Usage: " << program_name << " [OPTIONS]\n\n";
+    std::cout << "Options:\n";
+    std::cout << "  --category <name>   Run only the specified benchmark category\n";
+    std::cout << "  --bench <name>      Run only the specified benchmark within category\n";
+    std::cout << "  --output <file>     Write JSON results to file\n";
+    std::cout << "  --help              Show this help message\n";
+    std::cout << "\n";
+    std::cout << "Benchmark categories:\n"; // these names must match the strings compared in run_benchmarks
+    std::cout << "  io_context          io_context handler throughput tests\n";
+    std::cout << "  socket_throughput   Socket throughput tests\n";
+    std::cout << "  socket_latency      Socket latency tests\n";
+    std::cout << "  http_server         HTTP server benchmarks\n";
+    std::cout << "  all                 Run all categories (default)\n";
+    std::cout << "\n";
+    std::cout << "Individual benchmarks (--bench):\n"; // "concurrent" appears in three categories, so --bench without --category selects all three
+    std::cout << "  io_context:         single_threaded, multithreaded, interleaved, concurrent\n";
+    std::cout << "  socket_throughput:  unidirectional, bidirectional\n";
+    std::cout << "  socket_latency:     pingpong, concurrent\n";
+    std::cout << "  http_server:        single_conn, concurrent, multithread\n";
+}
+
+} // anonymous namespace
+
+int main( int argc, char* argv[] ) // Parses --category/--bench/--output/--help, then runs the selected benchmarks.
+{
+    char const* output_file = nullptr; // null: no JSON file is written
+    char const* category_filter = nullptr; // null: all categories
+    char const* bench_filter = nullptr; // null: all benchmarks in each selected category
+
+    for( int i = 1; i < argc; ++i )
+    {
+        if( std::strcmp( argv[i], "--category" ) == 0 )
+        {
+            if( i + 1 < argc )
+            {
+                category_filter = argv[++i]; // consumes the value; a repeated option overwrites the earlier one
+            }
+            else
+            {
+                std::cerr << "Error: --category requires an argument\n";
+                return 1;
+            }
+        }
+        else if( std::strcmp( argv[i], "--bench" ) == 0 )
+        {
+            if( i + 1 < argc )
+            {
+                bench_filter = argv[++i]; // stored as-is; the name is only matched later inside the category runners
+            }
+            else
+            {
+                std::cerr << "Error: --bench requires an argument\n";
+                return 1;
+            }
+        }
+        else if( std::strcmp( argv[i], "--output" ) == 0 )
+        {
+            if( i + 1 < argc )
+            {
+                output_file = argv[++i]; // points into argv; no copy is made
+            }
+            else
+            {
+                std::cerr << "Error: --output requires an argument\n";
+                return 1;
+            }
+        }
+        else if( std::strcmp( argv[i], "--help" ) == 0 || std::strcmp( argv[i], "-h" ) == 0 )
+        {
+            print_usage( argv[0] );
+            return 0; // help exits immediately without running anything, even if other options were given
+        }
+        else
+        {
+            std::cerr << "Unknown option: " << argv[i] << "\n";
+            print_usage( argv[0] );
+            return 1;
+        }
+    }
+
+    run_benchmarks( output_file, category_filter, bench_filter );
+    return 0; // always 0 once parsing succeeded; run_benchmarks returns void, so its failures do not affect the exit code
+}
diff --git a/bench/asio/socket_latency_bench.cpp b/bench/asio/socket_latency_bench.cpp
index 51f52d39..a8a73453 100644
--- a/bench/asio/socket_latency_bench.cpp
+++ b/bench/asio/socket_latency_bench.cpp
@@ -7,8 +7,9 @@
 // Official repository: https://github.com/cppalliance/corosio
 //
 
-#include <boost/asio/io_context.hpp>
-#include <boost/asio/ip/tcp.hpp>
+#include "benchmarks.hpp"
+#include "socket_utils.hpp"
+
 #include <boost/asio/co_spawn.hpp>
 #include <boost/asio/detached.hpp>
 #include <boost/asio/awaitable.hpp>
@@ -23,113 +24,80 @@
 
 #include "../common/benchmark.hpp"
 
-namespace asio = boost::asio;
-using tcp = asio::ip::tcp;
-
-// Create a connected socket pair using TCP loopback
-std::pair<tcp::socket, tcp::socket> make_socket_pair(asio::io_context& ioc)
-{
-    tcp::acceptor acceptor(ioc, tcp::endpoint(tcp::v4(), 0));
-    acceptor.set_option(tcp::acceptor::reuse_address(true));
-
-    tcp::socket client(ioc);
-    tcp::socket server(ioc);
-
-    auto endpoint = acceptor.local_endpoint();
-    client.connect(tcp::endpoint(asio::ip::address_v4::loopback(), endpoint.port()));
-    server = acceptor.accept();
-
-    // Disable Nagle's algorithm for low latency
-    client.set_option(tcp::no_delay(true));
-    server.set_option(tcp::no_delay(true));
-
-    return {std::move(client), std::move(server)};
-}
+namespace asio_bench {
+namespace {
 
-// Ping-pong coroutine task
 asio::awaitable<void> pingpong_task(
     tcp::socket& client,
     tcp::socket& server,
     std::size_t message_size,
     int iterations,
-    bench::statistics& stats)
+    bench::statistics& stats ) // Runs `iterations` ping-pong round trips over the pair and adds each round-trip time (microseconds) to stats.
 {
-    std::vector<char> send_buf(message_size, 'P');
-    std::vector<char> recv_buf(message_size);
+    std::vector<char> send_buf( message_size, 'P' ); // ping payload; never modified after construction
+    std::vector<char> recv_buf( message_size ); // used for the server read, the echo, and the client read
 
     try
     {
-        for (int i = 0; i < iterations; ++i)
+        for( int i = 0; i < iterations; ++i ) // an I/O exception ends the loop early; samples already added to stats are kept
         {
             bench::stopwatch sw;
 
-            // Client sends ping
             co_await asio::async_write(
                 client,
-                asio::buffer(send_buf.data(), send_buf.size()),
-                asio::use_awaitable);
+                asio::buffer( send_buf.data(), send_buf.size() ), // client sends ping
+                asio::use_awaitable );
 
-            // Server receives ping
             co_await asio::async_read(
                 server,
-                asio::buffer(recv_buf.data(), recv_buf.size()),
-                asio::use_awaitable);
+                asio::buffer( recv_buf.data(), recv_buf.size() ), // server receives the whole ping
+                asio::use_awaitable );
 
-            // Server sends pong
             co_await asio::async_write(
                 server,
-                asio::buffer(recv_buf.data(), recv_buf.size()),
-                asio::use_awaitable);
+                asio::buffer( recv_buf.data(), recv_buf.size() ), // server echoes it back as the pong
+                asio::use_awaitable );
 
-            // Client receives pong
             co_await asio::async_read(
                 client,
-                asio::buffer(recv_buf.data(), recv_buf.size()),
-                asio::use_awaitable);
+                asio::buffer( recv_buf.data(), recv_buf.size() ), // client receives the whole pong
+                asio::use_awaitable );
 
             double rtt_us = sw.elapsed_us();
-            stats.add(rtt_us);
+            stats.add( rtt_us ); // one sample per completed round trip
         }
     }
-    catch (std::exception const&) {}
+    catch( std::exception const& e ) { std::cerr << "  ERROR: ping-pong aborted early: " << e.what() << "\n"; }
 }
 
-// Measures Asio's round-trip latency for request-response patterns. Uses coroutines
-// for fair comparison with Corosio. Reports mean and tail latencies (p99, p99.9).
-// Compare against Corosio to evaluate which framework achieves lower latency for
-// RPC-style protocols.
-bench::benchmark_result bench_pingpong_latency(std::size_t message_size, int iterations)
+bench::benchmark_result bench_pingpong_latency( std::size_t message_size, int iterations ) // Measures round-trip latency of one socket pair for the given message size.
 {
     std::cout << "  Message size: " << message_size << " bytes, ";
     std::cout << "Iterations: " << iterations << "\n";
 
     asio::io_context ioc;
-    auto [client, server] = make_socket_pair(ioc);
+    auto [client, server] = make_socket_pair( ioc ); // presumably a connected loopback TCP pair from socket_utils.hpp - confirm
 
     bench::statistics latency_stats;
 
-    asio::co_spawn(ioc,
-        pingpong_task(client, server, message_size, iterations, latency_stats),
-        asio::detached);
+    asio::co_spawn( ioc,
+        pingpong_task( client, server, message_size, iterations, latency_stats ), // takes the sockets and stats by reference; they outlive ioc.run() below
+        asio::detached );
     ioc.run();
 
-    bench::print_latency_stats(latency_stats, "Round-trip latency");
+    bench::print_latency_stats( latency_stats, "Round-trip latency" );
     std::cout << "\n";
 
     client.close();
     server.close();
 
-    return bench::benchmark_result("pingpong_" + std::to_string(message_size))
-        .add("message_size", static_cast<double>(message_size))
-        .add("iterations", iterations)
-        .add_latency_stats("rtt", latency_stats);
+    return bench::benchmark_result( "pingpong_" + std::to_string( message_size ) ) // result name carries the message size, e.g. "pingpong_64"
+        .add( "message_size", static_cast<double>( message_size ) )
+        .add( "iterations", iterations ) // the requested count, not the number of samples actually recorded
+        .add_latency_stats( "rtt", latency_stats );
 }
 
-// Measures Asio's latency degradation under concurrent connection load. Multiple
-// socket pairs perform ping-pong simultaneously. Compare against Corosio to
-// evaluate which framework maintains lower latency as connection count increases.
-// Critical for understanding scalability limits.
-bench::benchmark_result bench_concurrent_latency(int num_pairs, std::size_t message_size, int iterations)
+bench::benchmark_result bench_concurrent_latency( int num_pairs, std::size_t message_size, int iterations ) // Runs num_pairs ping-pong tasks on one io_context and reports latency averaged across pairs.
 {
     std::cout << "  Concurrent pairs: " << num_pairs << ", ";
     std::cout << "Message size: " << message_size << " bytes, ";
@@ -137,166 +105,92 @@ bench::benchmark_result bench_concurrent_latency(int num_pairs, std::size_t mess
 
     asio::io_context ioc;
 
-    // Store sockets and stats separately for safe reference passing
     std::vector<tcp::socket> clients;
     std::vector<tcp::socket> servers;
-    std::vector<bench::statistics> stats(num_pairs);
+    std::vector<bench::statistics> stats( num_pairs ); // sized once up front; tasks hold references to its elements
 
-    clients.reserve(num_pairs);
-    servers.reserve(num_pairs);
+    clients.reserve( num_pairs );
+    servers.reserve( num_pairs );
 
-    for (int i = 0; i < num_pairs; ++i)
+    for( int i = 0; i < num_pairs; ++i ) // create every pair before any task is spawned
     {
-        auto [c, s] = make_socket_pair(ioc);
-        clients.push_back(std::move(c));
-        servers.push_back(std::move(s));
+        auto [c, s] = make_socket_pair( ioc );
+        clients.push_back( std::move( c ) );
+        servers.push_back( std::move( s ) );
     }
 
-    // Launch concurrent ping-pong tasks
-    for (int p = 0; p < num_pairs; ++p)
+    for( int p = 0; p < num_pairs; ++p ) // one detached task per pair; nothing runs until ioc.run() below
     {
-        asio::co_spawn(ioc,
-            pingpong_task(clients[p], servers[p], message_size, iterations, stats[p]),
-            asio::detached);
+        asio::co_spawn( ioc,
+            pingpong_task( clients[p], servers[p], message_size, iterations, stats[p] ), // the vectors are not resized after this point
+            asio::detached );
     }
 
     ioc.run();
 
     std::cout << "  Per-pair results:\n";
-    for (int i = 0; i < num_pairs && i < 3; ++i)
+    for( int i = 0; i < num_pairs && i < 3; ++i ) // print at most the first three pairs
     {
         std::cout << "    Pair " << i << ": mean="
-                  << bench::format_latency(stats[i].mean())
-                  << ", p99=" << bench::format_latency(stats[i].p99())
+                  << bench::format_latency( stats[i].mean() )
+                  << ", p99=" << bench::format_latency( stats[i].p99() )
                   << "\n";
     }
-    if (num_pairs > 3)
-        std::cout << "    ... (" << (num_pairs - 3) << " more pairs)\n";
+    if( num_pairs > 3 )
+        std::cout << "    ... (" << ( num_pairs - 3 ) << " more pairs)\n";
 
-    // Calculate average across all pairs
     double total_mean = 0;
     double total_p99 = 0;
-    for (auto& s : stats)
+    for( auto& s : stats ) // sum per-pair values; divided by num_pairs below
     {
         total_mean += s.mean();
         total_p99 += s.p99();
     }
     std::cout << "  Average mean latency: "
-              << bench::format_latency(total_mean / num_pairs) << "\n";
+              << bench::format_latency( total_mean / num_pairs ) << "\n"; // callers in this file pass num_pairs >= 1; zero would divide by zero
     std::cout << "  Average p99 latency:  "
-              << bench::format_latency(total_p99 / num_pairs) << "\n\n";
+              << bench::format_latency( total_p99 / num_pairs ) << "\n\n"; // mean of the per-pair p99 values, not a p99 over pooled samples
 
-    for (auto& c : clients)
+    for( auto& c : clients )
         c.close();
-    for (auto& s : servers)
+    for( auto& s : servers )
         s.close();
 
-    return bench::benchmark_result("concurrent_" + std::to_string(num_pairs) + "_pairs")
-        .add("num_pairs", num_pairs)
-        .add("message_size", static_cast<double>(message_size))
-        .add("iterations", iterations)
-        .add("avg_mean_latency_us", total_mean / num_pairs)
-        .add("avg_p99_latency_us", total_p99 / num_pairs);
+    return bench::benchmark_result( "concurrent_" + std::to_string( num_pairs ) + "_pairs" ) // e.g. "concurrent_4_pairs"
+        .add( "num_pairs", num_pairs )
+        .add( "message_size", static_cast<double>( message_size ) )
+        .add( "iterations", iterations ) // iterations per pair
+        .add( "avg_mean_latency_us", total_mean / num_pairs )
+        .add( "avg_p99_latency_us", total_p99 / num_pairs );
 }
 
-// Run benchmarks
-void run_benchmarks(const char* output_file, const char* bench_filter)
-{
-    std::cout << "Boost.Asio Socket Latency Benchmarks\n";
-    std::cout << "====================================\n";
+} // anonymous namespace
 
-    bench::result_collector collector("asio");
+void run_socket_latency_benchmarks( // Runs the socket latency benchmarks selected by filter and adds each result to collector.
+    bench::result_collector& collector, // owned by the caller; this function only appends to it
+    char const* filter ) // benchmark name; null or "all" selects both groups below
+{
+    std::cout << "\n>>> Socket Latency Benchmarks (Asio) <<<\n";
 
-    bool run_all = !bench_filter || std::strcmp(bench_filter, "all") == 0;
+    bool run_all = !filter || std::strcmp( filter, "all" ) == 0; // checked first below, so filter is never passed to strcmp when null
 
-    // Variable message sizes
-    std::vector<std::size_t> message_sizes = {1, 64, 1024};
+    std::vector<std::size_t> message_sizes = { 1, 64, 1024 }; // message sizes in bytes used by the ping-pong group
     int iterations = 1000;
 
-    if (run_all || std::strcmp(bench_filter, "pingpong") == 0)
-    {
-        bench::print_header("Ping-Pong Round-Trip Latency (Asio)");
-        for (auto size : message_sizes)
-            collector.add(bench_pingpong_latency(size, iterations));
-    }
-
-    if (run_all || std::strcmp(bench_filter, "concurrent") == 0)
+    if( run_all || std::strcmp( filter, "pingpong" ) == 0 ) // NOTE(review): a filter matching neither name prints only the banner above
     {
-        bench::print_header("Concurrent Socket Pairs Latency (Asio)");
-        collector.add(bench_concurrent_latency(1, 64, 1000));
-        collector.add(bench_concurrent_latency(4, 64, 500));
-        collector.add(bench_concurrent_latency(16, 64, 250));
+        bench::print_header( "Ping-Pong Round-Trip Latency (Asio)" );
+        for( auto size : message_sizes ) // one result per message size
+            collector.add( bench_pingpong_latency( size, iterations ) );
     }
 
-    std::cout << "\nBenchmarks complete.\n";
-
-    if (output_file)
+    if( run_all || std::strcmp( filter, "concurrent" ) == 0 ) // "concurrent" is also a benchmark name in the io_context category
     {
-        if (collector.write_json(output_file))
-            std::cout << "Results written to: " << output_file << "\n";
-        else
-            std::cerr << "Error: Failed to write results to: " << output_file << "\n";
+        bench::print_header( "Concurrent Socket Pairs Latency (Asio)" );
+        collector.add( bench_concurrent_latency( 1, 64, 1000 ) ); // arguments are num_pairs, message_size, iterations
+        collector.add( bench_concurrent_latency( 4, 64, 500 ) );
+        collector.add( bench_concurrent_latency( 16, 64, 250 ) ); // fewer iterations per pair as the pair count grows
     }
 }
 
-void print_usage(const char* program_name)
-{
-    std::cout << "Usage: " << program_name << " [OPTIONS]\n\n";
-    std::cout << "Options:\n";
-    std::cout << "  --bench <name>     Run only the specified benchmark\n";
-    std::cout << "  --output <file>    Write JSON results to file\n";
-    std::cout << "  --help             Show this help message\n";
-    std::cout << "\n";
-    std::cout << "Available benchmarks:\n";
-    std::cout << "  pingpong           Ping-pong round-trip latency (various message sizes)\n";
-    std::cout << "  concurrent         Concurrent socket pairs latency\n";
-    std::cout << "  all                Run all benchmarks (default)\n";
-}
-
-int main(int argc, char* argv[])
-{
-    const char* output_file = nullptr;
-    const char* bench_filter = nullptr;
-
-    for (int i = 1; i < argc; ++i)
-    {
-        if (std::strcmp(argv[i], "--bench") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                bench_filter = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --bench requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--output") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                output_file = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --output requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--help") == 0 || std::strcmp(argv[i], "-h") == 0)
-        {
-            print_usage(argv[0]);
-            return 0;
-        }
-        else
-        {
-            std::cerr << "Unknown option: " << argv[i] << "\n";
-            print_usage(argv[0]);
-            return 1;
-        }
-    }
-
-    run_benchmarks(output_file, bench_filter);
-    return 0;
-}
+} // namespace asio_bench
diff --git a/bench/asio/socket_throughput_bench.cpp b/bench/asio/socket_throughput_bench.cpp
index 59918efb..c2c51df3 100644
--- a/bench/asio/socket_throughput_bench.cpp
+++ b/bench/asio/socket_throughput_bench.cpp
@@ -7,8 +7,9 @@
 // Official repository: https://github.com/cppalliance/corosio
 //
 
-#include <boost/asio/io_context.hpp>
-#include <boost/asio/ip/tcp.hpp>
+#include "benchmarks.hpp"
+#include "socket_utils.hpp"
+
 #include <boost/asio/co_spawn.hpp>
 #include <boost/asio/detached.hpp>
 #include <boost/asio/awaitable.hpp>
@@ -16,7 +17,6 @@
 #include <boost/asio/write.hpp>
 #include <boost/asio/read.hpp>
 #include <boost/asio/buffer.hpp>
-#include <boost/asio/connect.hpp>
 
 #include <cstring>
 #include <iostream>
@@ -24,329 +24,225 @@
 
 #include "../common/benchmark.hpp"
 
-namespace asio = boost::asio;
-using tcp = asio::ip::tcp;
-
-// Create a connected socket pair using TCP loopback
-std::pair<tcp::socket, tcp::socket> make_socket_pair(asio::io_context& ioc)
-{
-    tcp::acceptor acceptor(ioc, tcp::endpoint(tcp::v4(), 0));
-    acceptor.set_option(tcp::acceptor::reuse_address(true));
-
-    tcp::socket client(ioc);
-    tcp::socket server(ioc);
-
-    auto endpoint = acceptor.local_endpoint();
-    client.connect(tcp::endpoint(asio::ip::address_v4::loopback(), endpoint.port()));
-    server = acceptor.accept();
+namespace asio_bench {
+namespace {
 
-    // Disable Nagle's algorithm for low latency
-    client.set_option(tcp::no_delay(true));
-    server.set_option(tcp::no_delay(true));
-
-    return {std::move(client), std::move(server)};
-}
-
-// Measures Asio's unidirectional socket throughput over loopback. Uses coroutines
-// for fair comparison with Corosio. Tests async I/O efficiency across different
-// buffer sizes. Compare against Corosio to evaluate which framework achieves
-// higher throughput for streaming workloads.
-bench::benchmark_result bench_throughput(std::size_t chunk_size, std::size_t total_bytes)
+bench::benchmark_result bench_throughput( std::size_t chunk_size, std::size_t total_bytes )
 {
     std::cout << "  Buffer size: " << chunk_size << " bytes, ";
-    std::cout << "Transfer: " << (total_bytes / (1024 * 1024)) << " MB\n";
+    std::cout << "Transfer: " << ( total_bytes / ( 1024 * 1024 ) ) << " MB\n";
 
     asio::io_context ioc;
-    auto [writer, reader] = make_socket_pair(ioc);
+    auto [writer, reader] = make_socket_pair( ioc );
 
-    std::vector<char> write_buf(chunk_size, 'x');
-    std::vector<char> read_buf(chunk_size);
+    std::vector<char> write_buf( chunk_size, 'x' );
+    std::vector<char> read_buf( chunk_size );
 
     std::size_t total_written = 0;
     std::size_t total_read = 0;
 
-    // Writer coroutine
     auto write_task = [&]() -> asio::awaitable<void>
     {
         try
         {
-            while (total_written < total_bytes)
+            while( total_written < total_bytes )
             {
-                std::size_t to_write = (std::min)(chunk_size, total_bytes - total_written);
+                std::size_t to_write = ( std::min )( chunk_size, total_bytes - total_written );
                 auto n = co_await writer.async_write_some(
-                    asio::buffer(write_buf.data(), to_write),
-                    asio::use_awaitable);
+                    asio::buffer( write_buf.data(), to_write ),
+                    asio::use_awaitable );
                 total_written += n;
             }
-            writer.shutdown(tcp::socket::shutdown_send);
+            writer.shutdown( tcp::socket::shutdown_send );
         }
-        catch (std::exception const&) {}
+        catch( std::exception const& ) {}
     };
 
-    // Reader coroutine
     auto read_task = [&]() -> asio::awaitable<void>
     {
         try
         {
-            while (total_read < total_bytes)
+            while( total_read < total_bytes )
             {
                 auto n = co_await reader.async_read_some(
-                    asio::buffer(read_buf.data(), read_buf.size()),
-                    asio::use_awaitable);
-                if (n == 0)
+                    asio::buffer( read_buf.data(), read_buf.size() ),
+                    asio::use_awaitable );
+                if( n == 0 )
                     break;
                 total_read += n;
             }
         }
-        catch (std::exception const&) {}
+        catch( std::exception const& ) {}
     };
 
     bench::stopwatch sw;
 
-    asio::co_spawn(ioc, write_task(), asio::detached);
-    asio::co_spawn(ioc, read_task(), asio::detached);
+    asio::co_spawn( ioc, write_task(), asio::detached );
+    asio::co_spawn( ioc, read_task(), asio::detached );
     ioc.run();
 
     double elapsed = sw.elapsed_seconds();
-    double throughput = static_cast<double>(total_read) / elapsed;
+    double throughput = static_cast<double>( total_read ) / elapsed;
 
     std::cout << "    Written:    " << total_written << " bytes\n";
     std::cout << "    Read:       " << total_read << " bytes\n";
-    std::cout << "    Elapsed:    " << std::fixed << std::setprecision(3)
+    std::cout << "    Elapsed:    " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "    Throughput: " << bench::format_throughput(throughput) << "\n\n";
+    std::cout << "    Throughput: " << bench::format_throughput( throughput ) << "\n\n";
 
     writer.close();
     reader.close();
 
-    return bench::benchmark_result("throughput_" + std::to_string(chunk_size))
-        .add("chunk_size", static_cast<double>(chunk_size))
-        .add("total_bytes", static_cast<double>(total_bytes))
-        .add("bytes_written", static_cast<double>(total_written))
-        .add("bytes_read", static_cast<double>(total_read))
-        .add("elapsed_s", elapsed)
-        .add("throughput_bytes_per_sec", throughput);
+    return bench::benchmark_result( "throughput_" + std::to_string( chunk_size ) )
+        .add( "chunk_size", static_cast<double>( chunk_size ) )
+        .add( "total_bytes", static_cast<double>( total_bytes ) )
+        .add( "bytes_written", static_cast<double>( total_written ) )
+        .add( "bytes_read", static_cast<double>( total_read ) )
+        .add( "elapsed_s", elapsed )
+        .add( "throughput_bytes_per_sec", throughput );
 }
 
-// Measures Asio's full-duplex throughput with simultaneous send/receive. Four
-// concurrent coroutines stress the scheduler's I/O multiplexing. Compare against
-// Corosio for protocols requiring bidirectional data flow like WebSocket or gRPC.
-bench::benchmark_result bench_bidirectional_throughput(std::size_t chunk_size, std::size_t total_bytes)
+bench::benchmark_result bench_bidirectional_throughput( std::size_t chunk_size, std::size_t total_bytes )
 {
     std::cout << "  Buffer size: " << chunk_size << " bytes, ";
-    std::cout << "Transfer: " << (total_bytes / (1024 * 1024)) << " MB each direction\n";
+    std::cout << "Transfer: " << ( total_bytes / ( 1024 * 1024 ) ) << " MB each direction\n";
 
     asio::io_context ioc;
-    auto [sock1, sock2] = make_socket_pair(ioc);
+    auto [sock1, sock2] = make_socket_pair( ioc );
 
-    std::vector<char> buf1(chunk_size, 'a');
-    std::vector<char> buf2(chunk_size, 'b');
+    std::vector<char> buf1( chunk_size, 'a' );
+    std::vector<char> buf2( chunk_size, 'b' );
 
     std::size_t written1 = 0, read1 = 0;
     std::size_t written2 = 0, read2 = 0;
 
-    // Socket 1 writes to socket 2
     auto write1_task = [&]() -> asio::awaitable<void>
     {
         try
         {
-            while (written1 < total_bytes)
+            while( written1 < total_bytes )
             {
-                std::size_t to_write = (std::min)(chunk_size, total_bytes - written1);
+                std::size_t to_write = ( std::min )( chunk_size, total_bytes - written1 );
                 auto n = co_await sock1.async_write_some(
-                    asio::buffer(buf1.data(), to_write),
-                    asio::use_awaitable);
+                    asio::buffer( buf1.data(), to_write ),
+                    asio::use_awaitable );
                 written1 += n;
             }
-            sock1.shutdown(tcp::socket::shutdown_send);
+            sock1.shutdown( tcp::socket::shutdown_send );
         }
-        catch (std::exception const&) {}
+        catch( std::exception const& ) {}
     };
 
-    // Socket 2 reads from socket 1
     auto read1_task = [&]() -> asio::awaitable<void>
     {
         try
         {
-            std::vector<char> rbuf(chunk_size);
-            while (read1 < total_bytes)
+            std::vector<char> rbuf( chunk_size );
+            while( read1 < total_bytes )
             {
                 auto n = co_await sock2.async_read_some(
-                    asio::buffer(rbuf.data(), rbuf.size()),
-                    asio::use_awaitable);
-                if (n == 0) break;
+                    asio::buffer( rbuf.data(), rbuf.size() ),
+                    asio::use_awaitable );
+                if( n == 0 ) break;
                 read1 += n;
             }
         }
-        catch (std::exception const&) {}
+        catch( std::exception const& ) {}
     };
 
-    // Socket 2 writes to socket 1
     auto write2_task = [&]() -> asio::awaitable<void>
     {
         try
         {
-            while (written2 < total_bytes)
+            while( written2 < total_bytes )
             {
-                std::size_t to_write = (std::min)(chunk_size, total_bytes - written2);
+                std::size_t to_write = ( std::min )( chunk_size, total_bytes - written2 );
                 auto n = co_await sock2.async_write_some(
-                    asio::buffer(buf2.data(), to_write),
-                    asio::use_awaitable);
+                    asio::buffer( buf2.data(), to_write ),
+                    asio::use_awaitable );
                 written2 += n;
             }
-            sock2.shutdown(tcp::socket::shutdown_send);
+            sock2.shutdown( tcp::socket::shutdown_send );
         }
-        catch (std::exception const&) {}
+        catch( std::exception const& ) {}
     };
 
-    // Socket 1 reads from socket 2
     auto read2_task = [&]() -> asio::awaitable<void>
     {
         try
         {
-            std::vector<char> rbuf(chunk_size);
-            while (read2 < total_bytes)
+            std::vector<char> rbuf( chunk_size );
+            while( read2 < total_bytes )
             {
                 auto n = co_await sock1.async_read_some(
-                    asio::buffer(rbuf.data(), rbuf.size()),
-                    asio::use_awaitable);
-                if (n == 0) break;
+                    asio::buffer( rbuf.data(), rbuf.size() ),
+                    asio::use_awaitable );
+                if( n == 0 ) break;
                 read2 += n;
             }
         }
-        catch (std::exception const&) {}
+        catch( std::exception const& ) {}
     };
 
     bench::stopwatch sw;
 
-    asio::co_spawn(ioc, write1_task(), asio::detached);
-    asio::co_spawn(ioc, read1_task(), asio::detached);
-    asio::co_spawn(ioc, write2_task(), asio::detached);
-    asio::co_spawn(ioc, read2_task(), asio::detached);
+    asio::co_spawn( ioc, write1_task(), asio::detached );
+    asio::co_spawn( ioc, read1_task(), asio::detached );
+    asio::co_spawn( ioc, write2_task(), asio::detached );
+    asio::co_spawn( ioc, read2_task(), asio::detached );
     ioc.run();
 
     double elapsed = sw.elapsed_seconds();
     std::size_t total_transferred = read1 + read2;
-    double throughput = static_cast<double>(total_transferred) / elapsed;
+    double throughput = static_cast<double>( total_transferred ) / elapsed;
 
     std::cout << "    Direction 1: " << read1 << " bytes\n";
     std::cout << "    Direction 2: " << read2 << " bytes\n";
     std::cout << "    Total:       " << total_transferred << " bytes\n";
-    std::cout << "    Elapsed:     " << std::fixed << std::setprecision(3)
+    std::cout << "    Elapsed:     " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "    Throughput:  " << bench::format_throughput(throughput)
+    std::cout << "    Throughput:  " << bench::format_throughput( throughput )
               << " (combined)\n\n";
 
     sock1.close();
     sock2.close();
 
-    return bench::benchmark_result("bidirectional_" + std::to_string(chunk_size))
-        .add("chunk_size", static_cast<double>(chunk_size))
-        .add("total_bytes_per_direction", static_cast<double>(total_bytes))
-        .add("bytes_direction1", static_cast<double>(read1))
-        .add("bytes_direction2", static_cast<double>(read2))
-        .add("total_transferred", static_cast<double>(total_transferred))
-        .add("elapsed_s", elapsed)
-        .add("throughput_bytes_per_sec", throughput);
+    return bench::benchmark_result( "bidirectional_" + std::to_string( chunk_size ) )
+        .add( "chunk_size", static_cast<double>( chunk_size ) )
+        .add( "total_bytes_per_direction", static_cast<double>( total_bytes ) )
+        .add( "bytes_direction1", static_cast<double>( read1 ) )
+        .add( "bytes_direction2", static_cast<double>( read2 ) )
+        .add( "total_transferred", static_cast<double>( total_transferred ) )
+        .add( "elapsed_s", elapsed )
+        .add( "throughput_bytes_per_sec", throughput );
 }
 
-// Run benchmarks
-void run_benchmarks(const char* output_file, const char* bench_filter)
-{
-    std::cout << "Boost.Asio Socket Throughput Benchmarks\n";
-    std::cout << "=======================================\n";
-
-    bench::result_collector collector("asio");
+} // anonymous namespace
 
-    bool run_all = !bench_filter || std::strcmp(bench_filter, "all") == 0;
+void run_socket_throughput_benchmarks(
+    bench::result_collector& collector,
+    char const* filter )
+{
+    std::cout << "\n>>> Socket Throughput Benchmarks (Asio) <<<\n";
 
-    // Variable buffer sizes
-    std::vector<std::size_t> buffer_sizes = {1024, 4096, 16384, 65536};
-    std::size_t transfer_size = 64 * 1024 * 1024; // 64 MB
+    bool run_all = !filter || std::strcmp( filter, "all" ) == 0;
 
-    if (run_all || std::strcmp(bench_filter, "unidirectional") == 0)
-    {
-        bench::print_header("Unidirectional Throughput (Asio)");
-        for (auto size : buffer_sizes)
-            collector.add(bench_throughput(size, transfer_size));
-    }
+    std::vector<std::size_t> buffer_sizes = { 1024, 4096, 16384, 65536 };
+    std::size_t transfer_size = 64 * 1024 * 1024;
 
-    if (run_all || std::strcmp(bench_filter, "bidirectional") == 0)
+    if( run_all || std::strcmp( filter, "unidirectional" ) == 0 )
     {
-        bench::print_header("Bidirectional Throughput (Asio)");
-        for (auto size : buffer_sizes)
-            collector.add(bench_bidirectional_throughput(size, transfer_size / 2));
+        bench::print_header( "Unidirectional Throughput (Asio)" );
+        for( auto size : buffer_sizes )
+            collector.add( bench_throughput( size, transfer_size ) );
     }
 
-    std::cout << "\nBenchmarks complete.\n";
-
-    if (output_file)
+    if( run_all || std::strcmp( filter, "bidirectional" ) == 0 )
     {
-        if (collector.write_json(output_file))
-            std::cout << "Results written to: " << output_file << "\n";
-        else
-            std::cerr << "Error: Failed to write results to: " << output_file << "\n";
+        bench::print_header( "Bidirectional Throughput (Asio)" );
+        for( auto size : buffer_sizes )
+            collector.add( bench_bidirectional_throughput( size, transfer_size / 2 ) );
     }
 }
 
-void print_usage(const char* program_name)
-{
-    std::cout << "Usage: " << program_name << " [OPTIONS]\n\n";
-    std::cout << "Options:\n";
-    std::cout << "  --bench <name>     Run only the specified benchmark\n";
-    std::cout << "  --output <file>    Write JSON results to file\n";
-    std::cout << "  --help             Show this help message\n";
-    std::cout << "\n";
-    std::cout << "Available benchmarks:\n";
-    std::cout << "  unidirectional     Unidirectional throughput (various buffer sizes)\n";
-    std::cout << "  bidirectional      Bidirectional throughput (various buffer sizes)\n";
-    std::cout << "  all                Run all benchmarks (default)\n";
-}
-
-int main(int argc, char* argv[])
-{
-    const char* output_file = nullptr;
-    const char* bench_filter = nullptr;
-
-    for (int i = 1; i < argc; ++i)
-    {
-        if (std::strcmp(argv[i], "--bench") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                bench_filter = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --bench requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--output") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                output_file = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --output requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--help") == 0 || std::strcmp(argv[i], "-h") == 0)
-        {
-            print_usage(argv[0]);
-            return 0;
-        }
-        else
-        {
-            std::cerr << "Unknown option: " << argv[i] << "\n";
-            print_usage(argv[0]);
-            return 1;
-        }
-    }
-
-    run_benchmarks(output_file, bench_filter);
-    return 0;
-}
+} // namespace asio_bench
diff --git a/bench/asio/socket_utils.hpp b/bench/asio/socket_utils.hpp
new file mode 100644
index 00000000..00f112de
--- /dev/null
+++ b/bench/asio/socket_utils.hpp
@@ -0,0 +1,44 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/corosio
+//
+
+#ifndef ASIO_BENCH_SOCKET_UTILS_HPP
+#define ASIO_BENCH_SOCKET_UTILS_HPP
+
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/ip/tcp.hpp>
+
+#include <utility>
+
+namespace asio_bench {
+
+namespace asio = boost::asio;
+using tcp = asio::ip::tcp;
+
+/** Create a connected {client, server} TCP socket pair over IPv4 loopback with TCP_NODELAY enabled on both ends. */
+inline std::pair<tcp::socket, tcp::socket> make_socket_pair( asio::io_context& ioc )
+{
+    tcp::acceptor acceptor( ioc, tcp::endpoint( tcp::v4(), 0 ) );
+    acceptor.set_option( tcp::acceptor::reuse_address( true ) );
+
+    tcp::socket client( ioc );
+    tcp::socket server( ioc );
+
+    auto endpoint = acceptor.local_endpoint();
+    client.connect( tcp::endpoint( asio::ip::address_v4::loopback(), endpoint.port() ) );
+    server = acceptor.accept();
+
+    client.set_option( tcp::no_delay( true ) );
+    server.set_option( tcp::no_delay( true ) );
+
+    return { std::move( client ), std::move( server ) };
+}
+
+} // namespace asio_bench
+
+#endif
diff --git a/bench/corosio/CMakeLists.txt b/bench/corosio/CMakeLists.txt
index f42c0312..8dda6406 100644
--- a/bench/corosio/CMakeLists.txt
+++ b/bench/corosio/CMakeLists.txt
@@ -8,21 +8,20 @@
 # Official repository: https://github.com/cppalliance/corosio
 #
 
-# Corosio benchmark executables
+add_executable(corosio_bench
+    main.cpp
+    io_context_bench.cpp
+    socket_throughput_bench.cpp
+    socket_latency_bench.cpp
+    http_server_bench.cpp)
 
-function(corosio_add_benchmark name source)
-    add_executable(${name} ${source})
-    target_link_libraries(${name}
-        PRIVATE
-            Boost::corosio
-            Threads::Threads)
-    set_property(TARGET ${name} PROPERTY FOLDER "benchmarks/corosio")
-    if (COROSIO_BENCH_LTO_SUPPORTED)
-        set_property(TARGET ${name} PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
-    endif ()
-endfunction()
+target_link_libraries(corosio_bench
+    PRIVATE
+        Boost::corosio
+        Threads::Threads)
 
-corosio_add_benchmark(corosio_bench_io_context io_context_bench.cpp)
-corosio_add_benchmark(corosio_bench_socket_throughput socket_throughput_bench.cpp)
-corosio_add_benchmark(corosio_bench_socket_latency socket_latency_bench.cpp)
-corosio_add_benchmark(corosio_bench_http_server http_server_bench.cpp)
+set_property(TARGET corosio_bench PROPERTY FOLDER "benchmarks/corosio")
+
+if (COROSIO_BENCH_LTO_SUPPORTED)
+    set_property(TARGET corosio_bench PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
+endif ()
diff --git a/bench/corosio/benchmarks.hpp b/bench/corosio/benchmarks.hpp
new file mode 100644
index 00000000..e3567618
--- /dev/null
+++ b/bench/corosio/benchmarks.hpp
@@ -0,0 +1,63 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/corosio
+//
+
+#ifndef COROSIO_BENCH_BENCHMARKS_HPP
+#define COROSIO_BENCH_BENCHMARKS_HPP
+
+#include "../common/benchmark.hpp"
+
+namespace corosio_bench {
+
+/** Run io_context benchmarks for the given context type.
+
+    @param collector Results collector.
+    @param filter Benchmark selector: nullptr or "all" runs every benchmark;
+           otherwise one of: single_threaded, multithreaded, interleaved, concurrent.
+*/
+template<typename Context>
+void run_io_context_benchmarks(
+    bench::result_collector& collector,
+    char const* filter );
+
+/** Run socket throughput benchmarks for the given context type.
+
+    @param collector Results collector.
+    @param filter Benchmark selector: nullptr or "all" runs every benchmark;
+           otherwise one of: unidirectional, bidirectional.
+*/
+template<typename Context>
+void run_socket_throughput_benchmarks(
+    bench::result_collector& collector,
+    char const* filter );
+
+/** Run socket latency benchmarks for the given context type.
+
+    @param collector Results collector.
+    @param filter Benchmark selector: nullptr or "all" runs every benchmark;
+           otherwise one of: pingpong, concurrent.
+*/
+template<typename Context>
+void run_socket_latency_benchmarks(
+    bench::result_collector& collector,
+    char const* filter );
+
+/** Run HTTP server benchmarks for the given context type.
+
+    @param collector Results collector.
+    @param filter Benchmark selector: nullptr or "all" runs every benchmark;
+           otherwise one of: single_conn, concurrent, multithread.
+*/
+template<typename Context>
+void run_http_server_benchmarks(
+    bench::result_collector& collector,
+    char const* filter );
+
+} // namespace corosio_bench
+
+#endif
diff --git a/bench/corosio/http_server_bench.cpp b/bench/corosio/http_server_bench.cpp
index 94a087c5..8f719c7e 100644
--- a/bench/corosio/http_server_bench.cpp
+++ b/bench/corosio/http_server_bench.cpp
@@ -7,7 +7,10 @@
 // Official repository: https://github.com/cppalliance/corosio
 //
 
+#include "benchmarks.hpp"
+
 #include <boost/corosio/io_context.hpp>
+#include <boost/corosio/detail/platform.hpp>
 #include <boost/corosio/tcp_socket.hpp>
 #include <boost/corosio/test/socket_pair.hpp>
 #include <boost/capy/buffers.hpp>
@@ -25,145 +28,137 @@
 #include <thread>
 #include <vector>
 
-#include "../common/backend_selection.hpp"
 #include "../common/benchmark.hpp"
 #include "../common/http_protocol.hpp"
 
 namespace corosio = boost::corosio;
 namespace capy = boost::capy;
 
-// Server coroutine: reads requests and sends responses
+namespace corosio_bench {
+namespace {
+
 capy::task<> server_task(
     corosio::tcp_socket& sock,
     int num_requests,
-    int& completed_requests)
+    int& completed_requests )
 {
     std::string buf;
 
-    while (completed_requests < num_requests)
+    while( completed_requests < num_requests )
     {
-        // Read until end of HTTP headers
         auto [ec, n] = co_await capy::read_until(
-            sock, capy::dynamic_buffer(buf), "\r\n\r\n");
-        if (ec)
+            sock, capy::dynamic_buffer( buf ), "\r\n\r\n" );
+        if( ec )
             co_return;
 
-        // Send response
         auto [wec, wn] = co_await capy::write(
-            sock, capy::const_buffer(bench::http::small_response, bench::http::small_response_size));
-        if (wec)
+            sock, capy::const_buffer( bench::http::small_response, bench::http::small_response_size ) );
+        if( wec )
             co_return;
 
         ++completed_requests;
-        buf.erase(0, n);
+        buf.erase( 0, n );
     }
 }
 
-// Client coroutine: sends requests and reads responses
 capy::task<> client_task(
     corosio::tcp_socket& sock,
     int num_requests,
-    bench::statistics& latency_stats)
+    bench::statistics& latency_stats )
 {
     std::string buf;
 
-    for (int i = 0; i < num_requests; ++i)
+    for( int i = 0; i < num_requests; ++i )
     {
         bench::stopwatch sw;
 
-        // Send request
         auto [wec, wn] = co_await capy::write(
-            sock, capy::const_buffer(bench::http::small_request, bench::http::small_request_size));
-        if (wec)
+            sock, capy::const_buffer( bench::http::small_request, bench::http::small_request_size ) );
+        if( wec )
             co_return;
 
-        // Read response headers
         auto [ec, header_end] = co_await capy::read_until(
-            sock, capy::dynamic_buffer(buf), "\r\n\r\n");
-        if (ec)
+            sock, capy::dynamic_buffer( buf ), "\r\n\r\n" );
+        if( ec )
             co_return;
 
-        // Parse Content-Length from headers and read body if needed
-        std::string_view headers(buf.data(), header_end);
+        std::string_view headers( buf.data(), header_end );
         std::size_t content_length = 0;
-        auto pos = headers.find("Content-Length: ");
-        if (pos != std::string_view::npos)
+        auto pos = headers.find( "Content-Length: " );
+        if( pos != std::string_view::npos )
         {
             pos += 16;
-            while (pos < headers.size() && headers[pos] >= '0' && headers[pos] <= '9')
+            while( pos < headers.size() && headers[pos] >= '0' && headers[pos] <= '9' )
             {
-                content_length = content_length * 10 + (headers[pos] - '0');
+                content_length = content_length * 10 + ( headers[pos] - '0' );
                 ++pos;
             }
         }
 
-        // Read body if not already in buffer
         std::size_t total_size = header_end + content_length;
-        if (buf.size() < total_size)
+        if( buf.size() < total_size )
         {
             std::size_t need = total_size - buf.size();
             std::size_t old_size = buf.size();
-            buf.resize(total_size);
+            buf.resize( total_size );
             auto [rec, rn] = co_await capy::read(
-                sock, capy::mutable_buffer(buf.data() + old_size, need));
-            if (rec)
+                sock, capy::mutable_buffer( buf.data() + old_size, need ) );
+            if( rec )
                 co_return;
         }
 
         double latency_us = sw.elapsed_us();
-        latency_stats.add(latency_us);
+        latency_stats.add( latency_us );
 
-        buf.erase(0, total_size);
+        buf.erase( 0, total_size );
     }
 }
 
-// Single connection benchmark
 template<typename Context>
-bench::benchmark_result bench_single_connection(int num_requests)
+bench::benchmark_result bench_single_connection( int num_requests )
 {
     std::cout << "  Requests: " << num_requests << "\n";
 
     Context ioc;
-    auto [client, server] = corosio::test::make_socket_pair(ioc);
+    auto [client, server] = corosio::test::make_socket_pair( ioc );
 
-    client.set_no_delay(true);
-    server.set_no_delay(true);
+    client.set_no_delay( true );
+    server.set_no_delay( true );
 
     int completed_requests = 0;
     bench::statistics latency_stats;
 
     bench::stopwatch total_sw;
 
-    capy::run_async(ioc.get_executor())(
-        server_task(server, num_requests, completed_requests));
-    capy::run_async(ioc.get_executor())(
-        client_task(client, num_requests, latency_stats));
+    capy::run_async( ioc.get_executor() )(
+        server_task( server, num_requests, completed_requests ) );
+    capy::run_async( ioc.get_executor() )(
+        client_task( client, num_requests, latency_stats ) );
 
     ioc.run();
 
     double elapsed = total_sw.elapsed_seconds();
-    double requests_per_sec = static_cast<double>(num_requests) / elapsed;
+    double requests_per_sec = static_cast<double>( num_requests ) / elapsed;
 
     std::cout << "    Completed: " << num_requests << " requests\n";
-    std::cout << "    Elapsed: " << std::fixed << std::setprecision(3)
+    std::cout << "    Elapsed: " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "    Throughput: " << bench::format_rate(requests_per_sec) << "\n";
-    bench::print_latency_stats(latency_stats, "Request latency");
+    std::cout << "    Throughput: " << bench::format_rate( requests_per_sec ) << "\n";
+    bench::print_latency_stats( latency_stats, "Request latency" );
     std::cout << "\n";
 
     client.close();
     server.close();
 
-    return bench::benchmark_result("single_conn")
-        .add("num_requests", num_requests)
-        .add("num_connections", 1)
-        .add("requests_per_sec", requests_per_sec)
-        .add_latency_stats("request_latency", latency_stats);
+    return bench::benchmark_result( "single_conn" )
+        .add( "num_requests", num_requests )
+        .add( "num_connections", 1 )
+        .add( "requests_per_sec", requests_per_sec )
+        .add_latency_stats( "request_latency", latency_stats );
 }
 
-// Concurrent connections benchmark
 template<typename Context>
-bench::benchmark_result bench_concurrent_connections(int num_connections, int requests_per_conn)
+bench::benchmark_result bench_concurrent_connections( int num_connections, int requests_per_conn )
 {
     int total_requests = num_connections * requests_per_conn;
     std::cout << "  Connections: " << num_connections
@@ -174,71 +169,69 @@ bench::benchmark_result bench_concurrent_connections(int num_connections, int re
 
     std::vector<corosio::tcp_socket> clients;
     std::vector<corosio::tcp_socket> servers;
-    std::vector<int> completed(num_connections, 0);
-    std::vector<bench::statistics> stats(num_connections);
+    std::vector<int> completed( num_connections, 0 );
+    std::vector<bench::statistics> stats( num_connections );
 
-    clients.reserve(num_connections);
-    servers.reserve(num_connections);
+    clients.reserve( num_connections );
+    servers.reserve( num_connections );
 
-    for (int i = 0; i < num_connections; ++i)
+    for( int i = 0; i < num_connections; ++i )
     {
-        auto [c, s] = corosio::test::make_socket_pair(ioc);
-        c.set_no_delay(true);
-        s.set_no_delay(true);
-        clients.push_back(std::move(c));
-        servers.push_back(std::move(s));
+        auto [c, s] = corosio::test::make_socket_pair( ioc );
+        c.set_no_delay( true );
+        s.set_no_delay( true );
+        clients.push_back( std::move( c ) );
+        servers.push_back( std::move( s ) );
     }
 
     bench::stopwatch total_sw;
 
-    for (int i = 0; i < num_connections; ++i)
+    for( int i = 0; i < num_connections; ++i )
     {
-        capy::run_async(ioc.get_executor())(
-            server_task(servers[i], requests_per_conn, completed[i]));
-        capy::run_async(ioc.get_executor())(
-            client_task(clients[i], requests_per_conn, stats[i]));
+        capy::run_async( ioc.get_executor() )(
+            server_task( servers[i], requests_per_conn, completed[i] ) );
+        capy::run_async( ioc.get_executor() )(
+            client_task( clients[i], requests_per_conn, stats[i] ) );
     }
 
     ioc.run();
 
     double elapsed = total_sw.elapsed_seconds();
-    double requests_per_sec = static_cast<double>(total_requests) / elapsed;
+    double requests_per_sec = static_cast<double>( total_requests ) / elapsed;
 
-    // Aggregate latency stats
     double total_mean = 0;
     double total_p99 = 0;
-    for (auto& s : stats)
+    for( auto& s : stats )
     {
         total_mean += s.mean();
         total_p99 += s.p99();
     }
 
     std::cout << "    Completed: " << total_requests << " requests\n";
-    std::cout << "    Elapsed: " << std::fixed << std::setprecision(3)
+    std::cout << "    Elapsed: " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "    Throughput: " << bench::format_rate(requests_per_sec) << "\n";
+    std::cout << "    Throughput: " << bench::format_rate( requests_per_sec ) << "\n";
     std::cout << "    Avg mean latency: "
-              << bench::format_latency(total_mean / num_connections) << "\n";
+              << bench::format_latency( total_mean / num_connections ) << "\n";
     std::cout << "    Avg p99 latency: "
-              << bench::format_latency(total_p99 / num_connections) << "\n\n";
+              << bench::format_latency( total_p99 / num_connections ) << "\n\n";
 
-    for (auto& c : clients)
+    for( auto& c : clients )
         c.close();
-    for (auto& s : servers)
+    for( auto& s : servers )
         s.close();
 
-    return bench::benchmark_result("concurrent_" + std::to_string(num_connections))
-        .add("num_connections", num_connections)
-        .add("requests_per_conn", requests_per_conn)
-        .add("total_requests", total_requests)
-        .add("requests_per_sec", requests_per_sec)
-        .add("avg_mean_latency_us", total_mean / num_connections)
-        .add("avg_p99_latency_us", total_p99 / num_connections);
+    return bench::benchmark_result( "concurrent_" + std::to_string( num_connections ) )
+        .add( "num_connections", num_connections )
+        .add( "requests_per_conn", requests_per_conn )
+        .add( "total_requests", total_requests )
+        .add( "requests_per_sec", requests_per_sec )
+        .add( "avg_mean_latency_us", total_mean / num_connections )
+        .add( "avg_p99_latency_us", total_p99 / num_connections );
 }
 
-// Multi-threaded benchmark: multiple threads calling run()
 template<typename Context>
-bench::benchmark_result bench_multithread(int num_threads, int num_connections, int requests_per_conn)
+bench::benchmark_result bench_multithread( int num_threads, int num_connections, int requests_per_conn )
 {
     int total_requests = num_connections * requests_per_conn;
     std::cout << "  Threads: " << num_threads
@@ -250,221 +243,128 @@ bench::benchmark_result bench_multithread(int num_threads, int num_connections,
 
     std::vector<corosio::tcp_socket> clients;
     std::vector<corosio::tcp_socket> servers;
-    std::vector<int> completed(num_connections, 0);
-    std::vector<bench::statistics> stats(num_connections);
+    std::vector<int> completed( num_connections, 0 );
+    std::vector<bench::statistics> stats( num_connections );
 
-    clients.reserve(num_connections);
-    servers.reserve(num_connections);
+    clients.reserve( num_connections );
+    servers.reserve( num_connections );
 
-    for (int i = 0; i < num_connections; ++i)
+    for( int i = 0; i < num_connections; ++i )
     {
-        auto [c, s] = corosio::test::make_socket_pair(ioc);
-        c.set_no_delay(true);
-        s.set_no_delay(true);
-        clients.push_back(std::move(c));
-        servers.push_back(std::move(s));
+        auto [c, s] = corosio::test::make_socket_pair( ioc );
+        c.set_no_delay( true );
+        s.set_no_delay( true );
+        clients.push_back( std::move( c ) );
+        servers.push_back( std::move( s ) );
     }
 
-    // Spawn all coroutines before starting threads
-    for (int i = 0; i < num_connections; ++i)
+    for( int i = 0; i < num_connections; ++i )
     {
-        capy::run_async(ioc.get_executor())(
-            server_task(servers[i], requests_per_conn, completed[i]));
-        capy::run_async(ioc.get_executor())(
-            client_task(clients[i], requests_per_conn, stats[i]));
+        capy::run_async( ioc.get_executor() )(
+            server_task( servers[i], requests_per_conn, completed[i] ) );
+        capy::run_async( ioc.get_executor() )(
+            client_task( clients[i], requests_per_conn, stats[i] ) );
     }
 
     bench::stopwatch total_sw;
 
-    // Launch worker threads
     std::vector<std::thread> threads;
-    threads.reserve(num_threads - 1);
-    for (int i = 1; i < num_threads; ++i)
-        threads.emplace_back([&ioc] { ioc.run(); });
+    threads.reserve( num_threads - 1 );
+    for( int i = 1; i < num_threads; ++i )
+        threads.emplace_back( [&ioc] { ioc.run(); } );
 
-    // Main thread also runs
     ioc.run();
 
-    // Wait for all threads
-    for (auto& t : threads)
+    for( auto& t : threads )
         t.join();
 
     double elapsed = total_sw.elapsed_seconds();
-    double requests_per_sec = static_cast<double>(total_requests) / elapsed;
+    double requests_per_sec = static_cast<double>( total_requests ) / elapsed;
 
-    // Aggregate latency stats
     double total_mean = 0;
     double total_p99 = 0;
-    for (auto& s : stats)
+    for( auto& s : stats )
     {
         total_mean += s.mean();
         total_p99 += s.p99();
     }
 
     std::cout << "    Completed: " << total_requests << " requests\n";
-    std::cout << "    Elapsed: " << std::fixed << std::setprecision(3)
+    std::cout << "    Elapsed: " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "    Throughput: " << bench::format_rate(requests_per_sec) << "\n";
+    std::cout << "    Throughput: " << bench::format_rate( requests_per_sec ) << "\n";
     std::cout << "    Avg mean latency: "
-              << bench::format_latency(total_mean / num_connections) << "\n";
+              << bench::format_latency( total_mean / num_connections ) << "\n";
     std::cout << "    Avg p99 latency: "
-              << bench::format_latency(total_p99 / num_connections) << "\n\n";
+              << bench::format_latency( total_p99 / num_connections ) << "\n\n";
 
-    for (auto& c : clients)
+    for( auto& c : clients )
         c.close();
-    for (auto& s : servers)
+    for( auto& s : servers )
         s.close();
 
-    return bench::benchmark_result("multithread_" + std::to_string(num_threads) + "t")
-        .add("num_threads", num_threads)
-        .add("num_connections", num_connections)
-        .add("requests_per_conn", requests_per_conn)
-        .add("total_requests", total_requests)
-        .add("requests_per_sec", requests_per_sec)
-        .add("avg_mean_latency_us", total_mean / num_connections)
-        .add("avg_p99_latency_us", total_p99 / num_connections);
+    return bench::benchmark_result( "multithread_" + std::to_string( num_threads ) + "t" )
+        .add( "num_threads", num_threads )
+        .add( "num_connections", num_connections )
+        .add( "requests_per_conn", requests_per_conn )
+        .add( "total_requests", total_requests )
+        .add( "requests_per_sec", requests_per_sec )
+        .add( "avg_mean_latency_us", total_mean / num_connections )
+        .add( "avg_p99_latency_us", total_p99 / num_connections );
 }
 
-// Run benchmarks for a specific context type
+} // anonymous namespace
+
 template<typename Context>
-void run_benchmarks(char const* backend_name, char const* output_file, char const* bench_filter)
+void run_http_server_benchmarks(
+    bench::result_collector& collector,
+    char const* filter )
 {
-    std::cout << "Boost.Corosio HTTP Server Benchmarks\n";
-    std::cout << "====================================\n";
-    std::cout << "Backend: " << backend_name << "\n\n";
+    std::cout << "\n>>> HTTP Server Benchmarks <<<\n";
 
-    bench::result_collector collector(backend_name);
-
-    bool run_all = !bench_filter || std::strcmp(bench_filter, "all") == 0;
-
-    if (run_all || std::strcmp(bench_filter, "single_conn") == 0)
-    {
-        bench::print_header("Single Connection (Sequential Requests)");
-        collector.add(bench_single_connection<Context>(10000));
-    }
+    bool run_all = !filter || std::strcmp( filter, "all" ) == 0;
 
-    if (run_all || std::strcmp(bench_filter, "concurrent") == 0)
+    if( run_all || std::strcmp( filter, "single_conn" ) == 0 )
     {
-        if (run_all)
-            std::this_thread::sleep_for(std::chrono::seconds(5));
-        bench::print_header("Concurrent Connections");
-        collector.add(bench_concurrent_connections<Context>(1, 10000));
-        collector.add(bench_concurrent_connections<Context>(4, 2500));
-        collector.add(bench_concurrent_connections<Context>(16, 625));
-        collector.add(bench_concurrent_connections<Context>(32, 312));
+        bench::print_header( "Single Connection (Sequential Requests)" );
+        collector.add( bench_single_connection<Context>( 10000 ) );
     }
 
-    if (run_all || std::strcmp(bench_filter, "multithread") == 0)
+    if( run_all || std::strcmp( filter, "concurrent" ) == 0 )
     {
-        if (run_all)
-            std::this_thread::sleep_for(std::chrono::seconds(5));
-        bench::print_header("Multi-threaded (32 connections, varying threads)");
-        collector.add(bench_multithread<Context>(1, 32, 312));
-        collector.add(bench_multithread<Context>(2, 32, 312));
-        collector.add(bench_multithread<Context>(4, 32, 312));
-        collector.add(bench_multithread<Context>(8, 32, 312));
+        if( run_all )
+            std::this_thread::sleep_for( std::chrono::seconds( 5 ) );
+        bench::print_header( "Concurrent Connections" );
+        collector.add( bench_concurrent_connections<Context>( 1, 10000 ) );
+        collector.add( bench_concurrent_connections<Context>( 4, 2500 ) );
+        collector.add( bench_concurrent_connections<Context>( 16, 625 ) );
+        collector.add( bench_concurrent_connections<Context>( 32, 312 ) );
     }
 
-    std::cout << "\nBenchmarks complete.\n";
-
-    if (output_file)
+    if( run_all || std::strcmp( filter, "multithread" ) == 0 )
     {
-        if (collector.write_json(output_file))
-            std::cout << "Results written to: " << output_file << "\n";
-        else
-            std::cerr << "Error: Failed to write results to: " << output_file << "\n";
+        if( run_all )
+            std::this_thread::sleep_for( std::chrono::seconds( 5 ) );
+        bench::print_header( "Multi-threaded (32 connections, varying threads)" );
+        collector.add( bench_multithread<Context>( 1, 32, 312 ) );
+        collector.add( bench_multithread<Context>( 2, 32, 312 ) );
+        collector.add( bench_multithread<Context>( 4, 32, 312 ) );
+        collector.add( bench_multithread<Context>( 8, 32, 312 ) );
     }
 }
 
-void print_usage(char const* program_name)
-{
-    std::cout << "Usage: " << program_name << " [OPTIONS]\n\n";
-    std::cout << "Options:\n";
-    std::cout << "  --backend <name>   Select I/O backend (default: platform default)\n";
-    std::cout << "  --bench <name>     Run only the specified benchmark\n";
-    std::cout << "  --output <file>    Write JSON results to file\n";
-    std::cout << "  --list             List available backends\n";
-    std::cout << "  --help             Show this help message\n";
-    std::cout << "\n";
-    std::cout << "Available benchmarks:\n";
-    std::cout << "  single_conn        Single connection, sequential requests\n";
-    std::cout << "  concurrent         Multiple concurrent connections\n";
-    std::cout << "  multithread        Multi-threaded with varying thread counts\n";
-    std::cout << "  all                Run all benchmarks (default)\n";
-    std::cout << "\n";
-    bench::print_available_backends();
-}
-
-int main(int argc, char* argv[])
-{
-    char const* backend = nullptr;
-    char const* output_file = nullptr;
-    char const* bench_filter = nullptr;
-
-    for (int i = 1; i < argc; ++i)
-    {
-        if (std::strcmp(argv[i], "--backend") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                backend = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --backend requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--bench") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                bench_filter = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --bench requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--output") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                output_file = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --output requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--list") == 0)
-        {
-            bench::print_available_backends();
-            return 0;
-        }
-        else if (std::strcmp(argv[i], "--help") == 0 || std::strcmp(argv[i], "-h") == 0)
-        {
-            print_usage(argv[0]);
-            return 0;
-        }
-        else
-        {
-            std::cerr << "Unknown option: " << argv[i] << "\n";
-            print_usage(argv[0]);
-            return 1;
-        }
-    }
-
-    // If no backend specified, use platform default
-    if (!backend)
-        backend = bench::default_backend_name();
-
-    // Dispatch to the selected backend using a generic lambda
-    return bench::dispatch_backend(backend,
-        [=]<typename Context>(const char* name)
-        {
-            run_benchmarks<Context>(name, output_file, bench_filter);
-        });
-}
+// Explicit instantiations, one per backend enabled via BOOST_COROSIO_HAS_*, because the template is defined in this .cpp
+#if BOOST_COROSIO_HAS_EPOLL
+template void run_http_server_benchmarks<corosio::epoll_context>(
+    bench::result_collector&, char const* );
+#endif
+#if BOOST_COROSIO_HAS_SELECT
+template void run_http_server_benchmarks<corosio::select_context>(
+    bench::result_collector&, char const* );
+#endif
+#if BOOST_COROSIO_HAS_IOCP
+template void run_http_server_benchmarks<corosio::iocp_context>(
+    bench::result_collector&, char const* );
+#endif
+
+} // namespace corosio_bench
diff --git a/bench/corosio/io_context_bench.cpp b/bench/corosio/io_context_bench.cpp
index 1eee5065..57d033d8 100644
--- a/bench/corosio/io_context_bench.cpp
+++ b/bench/corosio/io_context_bench.cpp
@@ -7,6 +7,8 @@
 // Official repository: https://github.com/cppalliance/corosio
 //
 
+#include "benchmarks.hpp"
+
 #include <boost/corosio/io_context.hpp>
 #include <boost/corosio/detail/platform.hpp>
 #include <boost/capy/ex/run_async.hpp>
@@ -18,34 +20,30 @@
 #include <thread>
 #include <vector>
 
-#include "../common/backend_selection.hpp"
 #include "../common/benchmark.hpp"
 
 namespace corosio = boost::corosio;
 namespace capy = boost::capy;
 
-// Coroutine that increments a counter
-capy::task<> increment_task(int& counter)
+namespace corosio_bench {
+namespace {
+
+capy::task<> increment_task( int& counter )
 {
     ++counter;
     co_return;
 }
 
-// Coroutine that increments an atomic counter
-capy::task<> atomic_increment_task(std::atomic<int>& counter)
+capy::task<> atomic_increment_task( std::atomic<int>& counter )
 {
-    counter.fetch_add(1, std::memory_order_relaxed);
+    counter.fetch_add( 1, std::memory_order_relaxed );
     co_return;
 }
 
-// Measures the raw throughput of posting and executing coroutines from a single
-// thread. This establishes a baseline for the scheduler's best-case performance
-// without any synchronization overhead. Useful for comparing coroutine dispatch
-// efficiency against other async frameworks and identifying per-handler overhead.
-template <typename Context>
-bench::benchmark_result bench_single_threaded_post(int num_handlers)
+template<typename Context>
+bench::benchmark_result bench_single_threaded_post( int num_handlers )
 {
-    bench::print_header("Single-threaded Handler Post");
+    bench::print_header( "Single-threaded Handler Post" );
 
     Context ioc;
     auto ex = ioc.get_executor();
@@ -53,84 +51,76 @@ bench::benchmark_result bench_single_threaded_post(int num_handlers)
 
     bench::stopwatch sw;
 
-    for (int i = 0; i < num_handlers; ++i)
-        capy::run_async(ex)(increment_task(counter));
+    for( int i = 0; i < num_handlers; ++i )
+        capy::run_async( ex )( increment_task( counter ) );
 
     ioc.run();
 
     double elapsed = sw.elapsed_seconds();
-    double ops_per_sec = static_cast<double>(num_handlers) / elapsed;
+    double ops_per_sec = static_cast<double>( num_handlers ) / elapsed;
 
     std::cout << "  Handlers:    " << num_handlers << "\n";
-    std::cout << "  Elapsed:     " << std::fixed << std::setprecision(3)
+    std::cout << "  Elapsed:     " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "  Throughput:  " << bench::format_rate(ops_per_sec) << "\n";
+    std::cout << "  Throughput:  " << bench::format_rate( ops_per_sec ) << "\n";
 
-    if (counter != num_handlers)
+    if( counter != num_handlers )
     {
         std::cerr << "  ERROR: counter mismatch! Expected " << num_handlers
                   << ", got " << counter << "\n";
     }
 
-    return bench::benchmark_result("single_threaded_post")
-        .add("handlers", num_handlers)
-        .add("elapsed_s", elapsed)
-        .add("ops_per_sec", ops_per_sec);
+    return bench::benchmark_result( "single_threaded_post" )
+        .add( "handlers", num_handlers )
+        .add( "elapsed_s", elapsed )
+        .add( "ops_per_sec", ops_per_sec );
 }
 
-// Measures how throughput scales when multiple threads call run() on the same
-// io_context. Pre-posts all work, then times execution across 1, 2, 4, 8 threads.
-// Reveals lock contention in the scheduler's work queue. Ideal scaling would show
-// linear speedup; sub-linear or negative scaling indicates contention issues that
-// may need strand-based partitioning in real applications.
-template <typename Context>
-bench::benchmark_result bench_multithreaded_scaling(int num_handlers, int max_threads)
+template<typename Context>
+bench::benchmark_result bench_multithreaded_scaling( int num_handlers, int max_threads )
 {
-    bench::print_header("Multi-threaded Scaling");
+    bench::print_header( "Multi-threaded Scaling" );
 
     std::cout << "  Handlers per test: " << num_handlers << "\n\n";
 
-    bench::benchmark_result result("multithreaded_scaling");
-    result.add("handlers", num_handlers);
+    bench::benchmark_result result( "multithreaded_scaling" );
+    result.add( "handlers", num_handlers );
 
     double baseline_ops = 0;
-    for (int num_threads = 1; num_threads <= max_threads; num_threads *= 2)
+    for( int num_threads = 1; num_threads <= max_threads; num_threads *= 2 )
     {
         Context ioc;
         auto ex = ioc.get_executor();
-        std::atomic<int> counter{0};
+        std::atomic<int> counter{ 0 };
 
-        // Post all handlers first
-        for (int i = 0; i < num_handlers; ++i)
-            capy::run_async(ex)(atomic_increment_task(counter));
+        for( int i = 0; i < num_handlers; ++i )
+            capy::run_async( ex )( atomic_increment_task( counter ) );
 
         bench::stopwatch sw;
 
-        // Run with multiple threads
         std::vector<std::thread> runners;
-        for (int t = 0; t < num_threads; ++t)
-            runners.emplace_back([&ioc]() { ioc.run(); });
+        for( int t = 0; t < num_threads; ++t )
+            runners.emplace_back( [&ioc]() { ioc.run(); } );
 
-        for (auto& t : runners)
+        for( auto& t : runners )
             t.join();
 
         double elapsed = sw.elapsed_seconds();
-        double ops_per_sec = static_cast<double>(num_handlers) / elapsed;
+        double ops_per_sec = static_cast<double>( num_handlers ) / elapsed;
 
         std::cout << "  " << num_threads << " thread(s): "
-                  << bench::format_rate(ops_per_sec);
+                  << bench::format_rate( ops_per_sec );
 
-        if (num_threads == 1)
+        if( num_threads == 1 )
             baseline_ops = ops_per_sec;
-        else if (baseline_ops > 0)
-            std::cout << " (speedup: " << std::fixed << std::setprecision(2)
-                      << (ops_per_sec / baseline_ops) << "x)";
+        else if( baseline_ops > 0 )
+            std::cout << " (speedup: " << std::fixed << std::setprecision( 2 )
+                      << ( ops_per_sec / baseline_ops ) << "x)";
         std::cout << "\n";
 
-        // Record per-thread results
-        result.add("threads_" + std::to_string(num_threads) + "_ops_per_sec", ops_per_sec);
+        result.add( "threads_" + std::to_string( num_threads ) + "_ops_per_sec", ops_per_sec );
 
-        if (counter.load() != num_handlers)
+        if( counter.load() != num_handlers )
         {
             std::cerr << "  ERROR: counter mismatch! Expected " << num_handlers
                       << ", got " << counter.load() << "\n";
@@ -140,15 +130,10 @@ bench::benchmark_result bench_multithreaded_scaling(int num_handlers, int max_th
     return result;
 }
 
-// Measures performance when posting and polling are interleaved, simulating a
-// game loop or GUI event pump that processes available work each frame. Posts a
-// batch of handlers, calls poll() to execute ready work, then repeats. Tests the
-// efficiency of poll() with small work batches and frequent context restarts,
-// which is common in latency-sensitive applications that can't block on run().
-template <typename Context>
-bench::benchmark_result bench_interleaved_post_run(int iterations, int handlers_per_iteration)
+template<typename Context>
+bench::benchmark_result bench_interleaved_post_run( int iterations, int handlers_per_iteration )
 {
-    bench::print_header("Interleaved Post/Run");
+    bench::print_header( "Interleaved Post/Run" );
 
     Context ioc;
     auto ex = ioc.get_executor();
@@ -157,236 +142,137 @@ bench::benchmark_result bench_interleaved_post_run(int iterations, int handlers_
 
     bench::stopwatch sw;
 
-    for (int iter = 0; iter < iterations; ++iter)
+    for( int iter = 0; iter < iterations; ++iter )
     {
-        for (int i = 0; i < handlers_per_iteration; ++i)
-            capy::run_async(ex)(increment_task(counter));
+        for( int i = 0; i < handlers_per_iteration; ++i )
+            capy::run_async( ex )( increment_task( counter ) );
 
         ioc.poll();
         ioc.restart();
     }
 
-    // Run any remaining handlers
     ioc.run();
 
     double elapsed = sw.elapsed_seconds();
-    double ops_per_sec = static_cast<double>(total_handlers) / elapsed;
+    double ops_per_sec = static_cast<double>( total_handlers ) / elapsed;
 
     std::cout << "  Iterations:        " << iterations << "\n";
     std::cout << "  Handlers/iter:     " << handlers_per_iteration << "\n";
     std::cout << "  Total handlers:    " << total_handlers << "\n";
-    std::cout << "  Elapsed:           " << std::fixed << std::setprecision(3)
+    std::cout << "  Elapsed:           " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "  Throughput:        " << bench::format_rate(ops_per_sec) << "\n";
+    std::cout << "  Throughput:        " << bench::format_rate( ops_per_sec ) << "\n";
 
-    if (counter != total_handlers)
+    if( counter != total_handlers )
     {
         std::cerr << "  ERROR: counter mismatch! Expected " << total_handlers
                   << ", got " << counter << "\n";
     }
 
-    return bench::benchmark_result("interleaved_post_run")
-        .add("iterations", iterations)
-        .add("handlers_per_iteration", handlers_per_iteration)
-        .add("total_handlers", total_handlers)
-        .add("elapsed_s", elapsed)
-        .add("ops_per_sec", ops_per_sec);
+    return bench::benchmark_result( "interleaved_post_run" )
+        .add( "iterations", iterations )
+        .add( "handlers_per_iteration", handlers_per_iteration )
+        .add( "total_handlers", total_handlers )
+        .add( "elapsed_s", elapsed )
+        .add( "ops_per_sec", ops_per_sec );
 }
 
-// Measures performance under realistic concurrent load where multiple threads
-// simultaneously post work AND execute it. This is the most stressful test for
-// the scheduler's synchronization, as threads contend for both the submission
-// and completion paths. Simulates server workloads where worker threads both
-// generate new tasks and process existing ones, revealing producer-consumer
-// bottlenecks.
-template <typename Context>
-bench::benchmark_result bench_concurrent_post_run(int num_threads, int handlers_per_thread)
+template<typename Context>
+bench::benchmark_result bench_concurrent_post_run( int num_threads, int handlers_per_thread )
 {
-    bench::print_header("Concurrent Post and Run");
+    bench::print_header( "Concurrent Post and Run" );
 
     Context ioc;
     auto ex = ioc.get_executor();
-    std::atomic<int> counter{0};
+    std::atomic<int> counter{ 0 };
     int total_handlers = num_threads * handlers_per_thread;
 
     bench::stopwatch sw;
 
-    // Launch threads that both post and run
     std::vector<std::thread> workers;
-    for (int t = 0; t < num_threads; ++t)
+    for( int t = 0; t < num_threads; ++t )
     {
-        workers.emplace_back([&ex, &ioc, &counter, handlers_per_thread]()
+        workers.emplace_back( [&ex, &ioc, &counter, handlers_per_thread]()
         {
-            for (int i = 0; i < handlers_per_thread; ++i)
-                capy::run_async(ex)(atomic_increment_task(counter));
+            for( int i = 0; i < handlers_per_thread; ++i )
+                capy::run_async( ex )( atomic_increment_task( counter ) );
             ioc.run();
-        });
+        } );
     }
 
-    for (auto& t : workers)
+    for( auto& t : workers )
         t.join();
 
     double elapsed = sw.elapsed_seconds();
-    double ops_per_sec = static_cast<double>(total_handlers) / elapsed;
+    double ops_per_sec = static_cast<double>( total_handlers ) / elapsed;
 
     std::cout << "  Threads:           " << num_threads << "\n";
     std::cout << "  Handlers/thread:   " << handlers_per_thread << "\n";
     std::cout << "  Total handlers:    " << total_handlers << "\n";
-    std::cout << "  Elapsed:           " << std::fixed << std::setprecision(3)
+    std::cout << "  Elapsed:           " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "  Throughput:        " << bench::format_rate(ops_per_sec) << "\n";
+    std::cout << "  Throughput:        " << bench::format_rate( ops_per_sec ) << "\n";
 
-    if (counter.load() != total_handlers)
+    if( counter.load() != total_handlers )
     {
         std::cerr << "  ERROR: counter mismatch! Expected " << total_handlers
                   << ", got " << counter.load() << "\n";
     }
 
-    return bench::benchmark_result("concurrent_post_run")
-        .add("threads", num_threads)
-        .add("handlers_per_thread", handlers_per_thread)
-        .add("total_handlers", total_handlers)
-        .add("elapsed_s", elapsed)
-        .add("ops_per_sec", ops_per_sec);
+    return bench::benchmark_result( "concurrent_post_run" )
+        .add( "threads", num_threads )
+        .add( "handlers_per_thread", handlers_per_thread )
+        .add( "total_handlers", total_handlers )
+        .add( "elapsed_s", elapsed )
+        .add( "ops_per_sec", ops_per_sec );
 }
 
-// Run benchmarks for a specific context type
-template <typename Context>
-void run_benchmarks(const char* backend_name, const char* output_file, const char* bench_filter)
-{
-    std::cout << "Boost.Corosio io_context Benchmarks\n";
-    std::cout << "====================================\n";
-    std::cout << "Backend: " << backend_name << "\n\n";
+} // anonymous namespace
 
-    bench::result_collector collector(backend_name);
+template<typename Context>
+void run_io_context_benchmarks(
+    bench::result_collector& collector,
+    char const* filter )
+{
+    std::cout << "\n>>> io_context Benchmarks <<<\n";
 
-    bool run_all = !bench_filter || std::strcmp(bench_filter, "all") == 0;
+    bool run_all = !filter || std::strcmp( filter, "all" ) == 0;
 
     // Warm up
     {
         Context ioc;
         auto ex = ioc.get_executor();
         int counter = 0;
-        for (int i = 0; i < 1000; ++i)
-            capy::run_async(ex)(increment_task(counter));
+        for( int i = 0; i < 1000; ++i )
+            capy::run_async( ex )( increment_task( counter ) );
         ioc.run();
     }
 
-    // Run selected benchmarks
-    if (run_all || std::strcmp(bench_filter, "single_threaded") == 0)
-        collector.add(bench_single_threaded_post<Context>(1000000));
+    if( run_all || std::strcmp( filter, "single_threaded" ) == 0 )
+        collector.add( bench_single_threaded_post<Context>( 1000000 ) );
 
-    if (run_all || std::strcmp(bench_filter, "multithreaded") == 0)
-        collector.add(bench_multithreaded_scaling<Context>(1000000, 8));
+    if( run_all || std::strcmp( filter, "multithreaded" ) == 0 )
+        collector.add( bench_multithreaded_scaling<Context>( 1000000, 8 ) );
 
-    if (run_all || std::strcmp(bench_filter, "interleaved") == 0)
-        collector.add(bench_interleaved_post_run<Context>(10000, 100));
-
-    if (run_all || std::strcmp(bench_filter, "concurrent") == 0)
-        collector.add(bench_concurrent_post_run<Context>(4, 250000));
-
-    std::cout << "\nBenchmarks complete.\n";
-
-    if (output_file)
-    {
-        if (collector.write_json(output_file))
-            std::cout << "Results written to: " << output_file << "\n";
-        else
-            std::cerr << "Error: Failed to write results to: " << output_file << "\n";
-    }
-}
+    if( run_all || std::strcmp( filter, "interleaved" ) == 0 )
+        collector.add( bench_interleaved_post_run<Context>( 10000, 100 ) );
 
-void print_usage(const char* program_name)
-{
-    std::cout << "Usage: " << program_name << " [OPTIONS]\n\n";
-    std::cout << "Options:\n";
-    std::cout << "  --backend <name>   Select I/O backend (default: platform default)\n";
-    std::cout << "  --bench <name>     Run only the specified benchmark\n";
-    std::cout << "  --output <file>    Write JSON results to file\n";
-    std::cout << "  --list             List available backends\n";
-    std::cout << "  --help             Show this help message\n";
-    std::cout << "\n";
-    std::cout << "Available benchmarks:\n";
-    std::cout << "  single_threaded    Single-threaded handler post throughput\n";
-    std::cout << "  multithreaded      Multi-threaded scaling test\n";
-    std::cout << "  interleaved        Interleaved post/poll pattern\n";
-    std::cout << "  concurrent         Concurrent post and run\n";
-    std::cout << "  all                Run all benchmarks (default)\n";
-    std::cout << "\n";
-    bench::print_available_backends();
+    if( run_all || std::strcmp( filter, "concurrent" ) == 0 )
+        collector.add( bench_concurrent_post_run<Context>( 4, 250000 ) );
 }
 
-int main(int argc, char* argv[])
-{
-    const char* backend = nullptr;
-    const char* output_file = nullptr;
-    const char* bench_filter = nullptr;
-
-    // Parse command-line arguments
-    for (int i = 1; i < argc; ++i)
-    {
-        if (std::strcmp(argv[i], "--backend") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                backend = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --backend requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--bench") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                bench_filter = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --bench requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--output") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                output_file = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --output requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--list") == 0)
-        {
-            bench::print_available_backends();
-            return 0;
-        }
-        else if (std::strcmp(argv[i], "--help") == 0 || std::strcmp(argv[i], "-h") == 0)
-        {
-            print_usage(argv[0]);
-            return 0;
-        }
-        else
-        {
-            std::cerr << "Unknown option: " << argv[i] << "\n";
-            print_usage(argv[0]);
-            return 1;
-        }
-    }
-
-    // If no backend specified, use platform default
-    if (!backend)
-        backend = bench::default_backend_name();
-
-    // Dispatch to the selected backend using a generic lambda
-    return bench::dispatch_backend(backend,
-        [=]<typename Context>(const char* name)
-        {
-            run_benchmarks<Context>(name, output_file, bench_filter);
-        });
-}
+// Explicit instantiations, one per backend enabled via BOOST_COROSIO_HAS_*, because the template is defined in this .cpp
+#if BOOST_COROSIO_HAS_EPOLL
+template void run_io_context_benchmarks<corosio::epoll_context>(
+    bench::result_collector&, char const* );
+#endif
+#if BOOST_COROSIO_HAS_SELECT
+template void run_io_context_benchmarks<corosio::select_context>(
+    bench::result_collector&, char const* );
+#endif
+#if BOOST_COROSIO_HAS_IOCP
+template void run_io_context_benchmarks<corosio::iocp_context>(
+    bench::result_collector&, char const* );
+#endif
+
+} // namespace corosio_bench
diff --git a/bench/corosio/main.cpp b/bench/corosio/main.cpp
new file mode 100644
index 00000000..28a38697
--- /dev/null
+++ b/bench/corosio/main.cpp
@@ -0,0 +1,175 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/corosio
+//
+
+#include "benchmarks.hpp"
+
+#include <boost/corosio/io_context.hpp>
+#include <boost/corosio/detail/platform.hpp>
+
+#include <cstring>
+#include <iostream>
+
+#include "../common/backend_selection.hpp"
+#include "../common/benchmark.hpp"
+
+namespace corosio = boost::corosio;
+
+namespace {
+
+// Runs the categories chosen by category_filter (null or "all" runs every one) on one
+// backend, then writes JSON to output_file if set. An unknown category is an error.
+template<typename Context>
+void run_benchmarks( char const* backend_name, char const* output_file,
+                     char const* category_filter, char const* bench_filter )
+{
+    bool const run_all = !category_filter || std::strcmp( category_filter, "all" ) == 0;
+    auto const wants = [&]( char const* c ) { return run_all || std::strcmp( category_filter, c ) == 0; };
+    // A mistyped --category used to run nothing yet still print "Benchmarks complete."
+    if( !wants( "io_context" ) && !wants( "socket_throughput" ) &&
+        !wants( "socket_latency" ) && !wants( "http_server" ) )
+    {
+        std::cerr << "Error: Unknown category: " << category_filter << "\n";
+        return;
+    }
+
+    std::cout << "Boost.Corosio Benchmarks\n";
+    std::cout << "========================\n";
+    std::cout << "Backend: " << backend_name << "\n";
+    bench::result_collector collector( backend_name );
+
+    if( wants( "io_context" ) )
+        corosio_bench::run_io_context_benchmarks<Context>( collector, bench_filter );
+    if( wants( "socket_throughput" ) )
+        corosio_bench::run_socket_throughput_benchmarks<Context>( collector, bench_filter );
+    if( wants( "socket_latency" ) )
+        corosio_bench::run_socket_latency_benchmarks<Context>( collector, bench_filter );
+    if( wants( "http_server" ) )
+        corosio_bench::run_http_server_benchmarks<Context>( collector, bench_filter );
+    std::cout << "\nBenchmarks complete.\n";
+    if( !output_file )
+        return;
+    if( collector.write_json( output_file ) )
+        std::cout << "Results written to: " << output_file << "\n";
+    else
+        std::cerr << "Error: Failed to write results to: " << output_file << "\n";
+}
+
+// Prints the option summary, the category names, and the per-category benchmark
+// names to std::cout, then calls bench::print_available_backends().
+void print_usage( char const* program_name )
+{
+    std::cout << "Usage: " << program_name << " [OPTIONS]\n\n"
+        "Options:\n"
+        "  --backend <name>    Select I/O backend (default: platform default)\n"
+        "  --category <name>   Run only the specified benchmark category\n"
+        "  --bench <name>      Run only the specified benchmark within category\n"
+        "  --output <file>     Write JSON results to file\n"
+        "  --list              List available backends\n"
+        "  --help              Show this help message\n"
+        "\n" "Benchmark categories:\n"
+        "  io_context          io_context handler throughput tests\n"
+        "  socket_throughput   Socket throughput tests\n"
+        "  socket_latency      Socket latency tests\n"
+        "  http_server         HTTP server benchmarks\n"
+        "  all                 Run all categories (default)\n"
+        "\n" "Individual benchmarks (--bench):\n"
+        "  io_context:         single_threaded, multithreaded, interleaved, concurrent\n"
+        "  socket_throughput:  unidirectional, bidirectional\n"
+        "  socket_latency:     pingpong, concurrent\n"
+        "  http_server:        single_conn, concurrent, multithread\n"
+        "\n";
+    bench::print_available_backends();
+}
+
+} // anonymous namespace
+
+// Parses the command line, picks the backend, and runs the selected benchmarks.
+//
+// --backend, --category, --bench and --output each consume the next argument;
+// when it is missing an error is printed to stderr and 1 is returned. --list and
+// --help/-h print their text and return 0. Any other argument prints the usage
+// and returns 1. Otherwise the result of bench::dispatch_backend is returned.
+int main( int argc, char* argv[] )
+{
+    char const* backend = nullptr;
+    char const* output_file = nullptr;
+    char const* category_filter = nullptr;
+    char const* bench_filter = nullptr;
+
+    // A flag that takes a value, the variable that receives it, and the
+    // message printed when the value is missing.
+    struct value_option
+    {
+        char const* flag;
+        char const** target;
+        char const* missing_value_error;
+    };
+    value_option const value_options[] = {
+        { "--backend", &backend, "Error: --backend requires an argument\n" },
+        { "--category", &category_filter, "Error: --category requires an argument\n" },
+        { "--bench", &bench_filter, "Error: --bench requires an argument\n" },
+        { "--output", &output_file, "Error: --output requires an argument\n" },
+    };
+
+    for( int i = 1; i < argc; ++i )
+    {
+        char const* const arg = argv[i];
+
+        // Value-taking flags are tried first, in table order.
+        value_option const* match = nullptr;
+        for( auto const& option : value_options )
+        {
+            if( std::strcmp( arg, option.flag ) == 0 )
+            {
+                match = &option;
+                break;
+            }
+        }
+
+        if( match )
+        {
+            if( i + 1 >= argc )
+            {
+                std::cerr << match->missing_value_error;
+                return 1;
+            }
+            *match->target = argv[++i];
+            continue;
+        }
+
+        if( std::strcmp( arg, "--list" ) == 0 )
+        {
+            bench::print_available_backends();
+            return 0;
+        }
+
+        if( std::strcmp( arg, "--help" ) == 0 || std::strcmp( arg, "-h" ) == 0 )
+        {
+            print_usage( argv[0] );
+            return 0;
+        }
+
+        // Anything else is not a recognized option.
+        std::cerr << "Unknown option: " << arg << "\n";
+        print_usage( argv[0] );
+        return 1;
+    }
+
+    // No --backend given: use the name bench::default_backend_name() returns.
+    if( !backend )
+        backend = bench::default_backend_name();
+
+    // Hand off to bench::dispatch_backend with a generic lambda; it is presumably
+    // instantiated with the context type matching the backend name.
+    return bench::dispatch_backend( backend,
+        [=]<typename Context>( char const* name )
+        {
+            run_benchmarks<Context>( name, output_file, category_filter, bench_filter );
+        } );
+}
diff --git a/bench/corosio/socket_latency_bench.cpp b/bench/corosio/socket_latency_bench.cpp
index 9e15aa30..9ecd0c84 100644
--- a/bench/corosio/socket_latency_bench.cpp
+++ b/bench/corosio/socket_latency_bench.cpp
@@ -7,7 +7,10 @@
 // Official repository: https://github.com/cppalliance/corosio
 //
 
+#include "benchmarks.hpp"
+
 #include <boost/corosio/io_context.hpp>
+#include <boost/corosio/detail/platform.hpp>
 #include <boost/corosio/tcp_socket.hpp>
 #include <boost/corosio/test/socket_pair.hpp>
 #include <boost/capy/buffers.hpp>
@@ -20,112 +23,97 @@
 #include <iostream>
 #include <vector>
 
-#include "../common/backend_selection.hpp"
 #include "../common/benchmark.hpp"
 
 namespace corosio = boost::corosio;
 namespace capy = boost::capy;
 
-// Ping-pong coroutine task
+namespace corosio_bench {
+namespace {
+// Times `iterations` ping-pong round trips between client and server, adding each one (in us) to stats.
 capy::task<> pingpong_task(
     corosio::tcp_socket& client,
     corosio::tcp_socket& server,
     std::size_t message_size,
     int iterations,
-    bench::statistics& stats)
+    bench::statistics& stats )
 {
-    std::vector<char> send_buf(message_size, 'P');
-    std::vector<char> recv_buf(message_size);
+    std::vector<char> send_buf( message_size, 'P' );
+    std::vector<char> recv_buf( message_size ); // shared by the server read and the client read
 
-    for (int i = 0; i < iterations; ++i)
+    for( int i = 0; i < iterations; ++i )
     {
         bench::stopwatch sw;
 
-        // Client sends ping
         auto [ec1, n1] = co_await capy::write(
-            client, capy::const_buffer(send_buf.data(), send_buf.size()));
-        if (ec1)
+            client, capy::const_buffer( send_buf.data(), send_buf.size() ) ); // client sends ping
+        if( ec1 ) // any error below: report it and stop without recording this round trip
         {
             std::cerr << "    Write error: " << ec1.message() << "\n";
             co_return;
         }
 
-        // Server receives ping
         auto [ec2, n2] = co_await capy::read(
-            server, capy::mutable_buffer(recv_buf.data(), recv_buf.size()));
-        if (ec2)
+            server, capy::mutable_buffer( recv_buf.data(), recv_buf.size() ) ); // server receives ping
+        if( ec2 )
         {
             std::cerr << "    Server read error: " << ec2.message() << "\n";
             co_return;
         }
 
-        // Server sends pong
         auto [ec3, n3] = co_await capy::write(
-            server, capy::const_buffer(recv_buf.data(), n2));
-        if (ec3)
+            server, capy::const_buffer( recv_buf.data(), n2 ) ); // server sends pong: echoes the n2 bytes read
+        if( ec3 )
         {
             std::cerr << "    Server write error: " << ec3.message() << "\n";
             co_return;
         }
 
-        // Client receives pong
         auto [ec4, n4] = co_await capy::read(
-            client, capy::mutable_buffer(recv_buf.data(), recv_buf.size()));
-        if (ec4)
+            client, capy::mutable_buffer( recv_buf.data(), recv_buf.size() ) ); // client receives pong
+        if( ec4 )
         {
             std::cerr << "    Client read error: " << ec4.message() << "\n";
             co_return;
         }
 
         double rtt_us = sw.elapsed_us();
-        stats.add(rtt_us);
+        stats.add( rtt_us ); // one sample per completed round trip
     }
 }
 
 // Measures round-trip latency for a request-response pattern over loopback sockets.
-// Client sends a message, server echoes it back, measuring the complete cycle time.
-// This is the fundamental latency metric for RPC-style protocols. Reports mean,
-// median (p50), and tail latencies (p99, p99.9) which are critical for SLA compliance.
-// Different message sizes reveal fixed overhead vs. size-dependent costs.
 template<typename Context>
-bench::benchmark_result bench_pingpong_latency(std::size_t message_size, int iterations)
+bench::benchmark_result bench_pingpong_latency( std::size_t message_size, int iterations )
 {
     std::cout << "  Message size: " << message_size << " bytes, ";
     std::cout << "Iterations: " << iterations << "\n";
 
     Context ioc;
-    auto [client, server] = corosio::test::make_socket_pair(ioc);
+    auto [client, server] = corosio::test::make_socket_pair( ioc );
 
-    // Disable Nagle's algorithm for low latency
-    client.set_no_delay(true);
-    server.set_no_delay(true);
+    client.set_no_delay( true ); // disable Nagle's algorithm for low latency
+    server.set_no_delay( true );
 
     bench::statistics latency_stats;
 
-    capy::run_async(ioc.get_executor())(
-        pingpong_task(client, server, message_size, iterations, latency_stats));
+    capy::run_async( ioc.get_executor() )(
+        pingpong_task( client, server, message_size, iterations, latency_stats ) );
     ioc.run();
 
-    bench::print_latency_stats(latency_stats, "Round-trip latency");
+    bench::print_latency_stats( latency_stats, "Round-trip latency" );
     std::cout << "\n";
 
     client.close();
     server.close();
-
-    return bench::benchmark_result("pingpong_" + std::to_string(message_size))
-        .add("message_size", static_cast<double>(message_size))
-        .add("iterations", iterations)
-        .add_latency_stats("rtt", latency_stats);
+    return bench::benchmark_result( "pingpong_" + std::to_string( message_size ) ) // result name carries the size
+        .add( "message_size", static_cast<double>( message_size ) )
+        .add( "iterations", iterations )
+        .add_latency_stats( "rtt", latency_stats );
 }
 
-// Measures latency degradation under concurrent connection load. Multiple socket
-// pairs perform ping-pong simultaneously, revealing how latency increases as the
-// scheduler multiplexes more connections. Critical for capacity planning: shows
-// how many concurrent connections can be sustained before latency becomes
-// unacceptable. A well-designed scheduler should show gradual degradation rather
-// than sudden latency spikes.
 template<typename Context>
-bench::benchmark_result bench_concurrent_latency(int num_pairs, std::size_t message_size, int iterations)
+bench::benchmark_result bench_concurrent_latency( int num_pairs, std::size_t message_size, int iterations )
 {
     std::cout << "  Concurrent pairs: " << num_pairs << ", ";
     std::cout << "Message size: " << message_size << " bytes, ";
@@ -133,200 +121,108 @@ bench::benchmark_result bench_concurrent_latency(int num_pairs, std::size_t mess
 
     Context ioc;
 
-    // Store sockets and stats separately for safe reference passing
     std::vector<corosio::tcp_socket> clients;
     std::vector<corosio::tcp_socket> servers;
-    std::vector<bench::statistics> stats(num_pairs);
+    std::vector<bench::statistics> stats( num_pairs );
 
-    clients.reserve(num_pairs);
-    servers.reserve(num_pairs);
+    clients.reserve( num_pairs );
+    servers.reserve( num_pairs );
 
-    for (int i = 0; i < num_pairs; ++i)
+    for( int i = 0; i < num_pairs; ++i )
     {
-        auto [c, s] = corosio::test::make_socket_pair(ioc);
-        // Disable Nagle's algorithm for low latency
-        c.set_no_delay(true);
-        s.set_no_delay(true);
-        clients.push_back(std::move(c));
-        servers.push_back(std::move(s));
+        auto [c, s] = corosio::test::make_socket_pair( ioc );
+        c.set_no_delay( true );
+        s.set_no_delay( true );
+        clients.push_back( std::move( c ) );
+        servers.push_back( std::move( s ) );
     }
 
-    // Launch concurrent ping-pong tasks
-    for (int p = 0; p < num_pairs; ++p)
+    for( int p = 0; p < num_pairs; ++p )
     {
-        capy::run_async(ioc.get_executor())(
-            pingpong_task(clients[p], servers[p], message_size, iterations, stats[p]));
+        capy::run_async( ioc.get_executor() )(
+            pingpong_task( clients[p], servers[p], message_size, iterations, stats[p] ) );
     }
 
     ioc.run();
 
     std::cout << "  Per-pair results:\n";
-    for (int i = 0; i < num_pairs && i < 3; ++i)
+    for( int i = 0; i < num_pairs && i < 3; ++i )
     {
         std::cout << "    Pair " << i << ": mean="
-                  << bench::format_latency(stats[i].mean())
-                  << ", p99=" << bench::format_latency(stats[i].p99())
+                  << bench::format_latency( stats[i].mean() )
+                  << ", p99=" << bench::format_latency( stats[i].p99() )
                   << "\n";
     }
-    if (num_pairs > 3)
-        std::cout << "    ... (" << (num_pairs - 3) << " more pairs)\n";
+    if( num_pairs > 3 )
+        std::cout << "    ... (" << ( num_pairs - 3 ) << " more pairs)\n";
 
-    // Calculate average across all pairs
     double total_mean = 0;
     double total_p99 = 0;
-    for (auto& s : stats)
+    for( auto& s : stats )
     {
         total_mean += s.mean();
         total_p99 += s.p99();
     }
     std::cout << "  Average mean latency: "
-              << bench::format_latency(total_mean / num_pairs) << "\n";
+              << bench::format_latency( total_mean / num_pairs ) << "\n";
     std::cout << "  Average p99 latency:  "
-              << bench::format_latency(total_p99 / num_pairs) << "\n\n";
+              << bench::format_latency( total_p99 / num_pairs ) << "\n\n";
 
-    for (auto& c : clients)
+    for( auto& c : clients )
         c.close();
-    for (auto& s : servers)
+    for( auto& s : servers )
         s.close();
 
-    return bench::benchmark_result("concurrent_" + std::to_string(num_pairs) + "_pairs")
-        .add("num_pairs", num_pairs)
-        .add("message_size", static_cast<double>(message_size))
-        .add("iterations", iterations)
-        .add("avg_mean_latency_us", total_mean / num_pairs)
-        .add("avg_p99_latency_us", total_p99 / num_pairs);
+    return bench::benchmark_result( "concurrent_" + std::to_string( num_pairs ) + "_pairs" )
+        .add( "num_pairs", num_pairs )
+        .add( "message_size", static_cast<double>( message_size ) )
+        .add( "iterations", iterations )
+        .add( "avg_mean_latency_us", total_mean / num_pairs )
+        .add( "avg_p99_latency_us", total_p99 / num_pairs );
 }
 
-// Run benchmarks for a specific context type
+} // anonymous namespace
+// Adds the latency results selected by filter ("pingpong", "concurrent", or null/"all" for both) to collector.
 template<typename Context>
-void run_benchmarks(const char* backend_name, const char* output_file, const char* bench_filter)
+void run_socket_latency_benchmarks(
+    bench::result_collector& collector,
+    char const* filter )
 {
-    std::cout << "Boost.Corosio Socket Latency Benchmarks\n";
-    std::cout << "=======================================\n";
-    std::cout << "Backend: " << backend_name << "\n\n";
+    std::cout << "\n>>> Socket Latency Benchmarks <<<\n";
 
-    bench::result_collector collector(backend_name);
+    bool run_all = !filter || std::strcmp( filter, "all" ) == 0; // null is checked first, so strcmp never sees it
 
-    bool run_all = !bench_filter || std::strcmp(bench_filter, "all") == 0;
-
-    // Variable message sizes
-    std::vector<std::size_t> message_sizes = {1, 64, 1024};
+    std::vector<std::size_t> message_sizes = { 1, 64, 1024 }; // one ping-pong run per message size
     int iterations = 1000;
 
-    if (run_all || std::strcmp(bench_filter, "pingpong") == 0)
+    if( run_all || std::strcmp( filter, "pingpong" ) == 0 )
     {
-        bench::print_header("Ping-Pong Round-Trip Latency");
-        for (auto size : message_sizes)
-            collector.add(bench_pingpong_latency<Context>(size, iterations));
+        bench::print_header( "Ping-Pong Round-Trip Latency" );
+        for( auto size : message_sizes )
+            collector.add( bench_pingpong_latency<Context>( size, iterations ) );
     }
 
-    if (run_all || std::strcmp(bench_filter, "concurrent") == 0)
+    if( run_all || std::strcmp( filter, "concurrent" ) == 0 )
     {
-        bench::print_header("Concurrent Socket Pairs Latency");
-        collector.add(bench_concurrent_latency<Context>(1, 64, 1000));
-        collector.add(bench_concurrent_latency<Context>(4, 64, 500));
-        collector.add(bench_concurrent_latency<Context>(16, 64, 250));
-    }
-
-    std::cout << "\nBenchmarks complete.\n";
-
-    if (output_file)
-    {
-        if (collector.write_json(output_file))
-            std::cout << "Results written to: " << output_file << "\n";
-        else
-            std::cerr << "Error: Failed to write results to: " << output_file << "\n";
+        bench::print_header( "Concurrent Socket Pairs Latency" );
+        collector.add( bench_concurrent_latency<Context>( 1, 64, 1000 ) ); // args: num_pairs, message_size, iterations
+        collector.add( bench_concurrent_latency<Context>( 4, 64, 500 ) );
+        collector.add( bench_concurrent_latency<Context>( 16, 64, 250 ) );
     }
 }
 
-void print_usage(const char* program_name)
-{
-    std::cout << "Usage: " << program_name << " [OPTIONS]\n\n";
-    std::cout << "Options:\n";
-    std::cout << "  --backend <name>   Select I/O backend (default: platform default)\n";
-    std::cout << "  --bench <name>     Run only the specified benchmark\n";
-    std::cout << "  --output <file>    Write JSON results to file\n";
-    std::cout << "  --list             List available backends\n";
-    std::cout << "  --help             Show this help message\n";
-    std::cout << "\n";
-    std::cout << "Available benchmarks:\n";
-    std::cout << "  pingpong           Ping-pong round-trip latency (various message sizes)\n";
-    std::cout << "  concurrent         Concurrent socket pairs latency\n";
-    std::cout << "  all                Run all benchmarks (default)\n";
-    std::cout << "\n";
-    bench::print_available_backends();
-}
-
-int main(int argc, char* argv[])
-{
-    const char* backend = nullptr;
-    const char* output_file = nullptr;
-    const char* bench_filter = nullptr;
-
-    for (int i = 1; i < argc; ++i)
-    {
-        if (std::strcmp(argv[i], "--backend") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                backend = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --backend requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--bench") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                bench_filter = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --bench requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--output") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                output_file = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --output requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--list") == 0)
-        {
-            bench::print_available_backends();
-            return 0;
-        }
-        else if (std::strcmp(argv[i], "--help") == 0 || std::strcmp(argv[i], "-h") == 0)
-        {
-            print_usage(argv[0]);
-            return 0;
-        }
-        else
-        {
-            std::cerr << "Unknown option: " << argv[i] << "\n";
-            print_usage(argv[0]);
-            return 1;
-        }
-    }
-
-    // If no backend specified, use platform default
-    if (!backend)
-        backend = bench::default_backend_name();
-
-    // Dispatch to the selected backend using a generic lambda
-    return bench::dispatch_backend(backend,
-        [=]<typename Context>(const char* name)
-        {
-            run_benchmarks<Context>(name, output_file, bench_filter);
-        });
-}
+// Explicit instantiations, each guarded by its BOOST_COROSIO_HAS_* macro; main.cpp calls this template.
+#if BOOST_COROSIO_HAS_EPOLL
+template void run_socket_latency_benchmarks<corosio::epoll_context>(
+    bench::result_collector&, char const* );
+#endif
+#if BOOST_COROSIO_HAS_SELECT
+template void run_socket_latency_benchmarks<corosio::select_context>(
+    bench::result_collector&, char const* );
+#endif
+#if BOOST_COROSIO_HAS_IOCP
+template void run_socket_latency_benchmarks<corosio::iocp_context>(
+    bench::result_collector&, char const* );
+#endif
+
+} // namespace corosio_bench
diff --git a/bench/corosio/socket_throughput_bench.cpp b/bench/corosio/socket_throughput_bench.cpp
index 6ecbd030..919a3535 100644
--- a/bench/corosio/socket_throughput_bench.cpp
+++ b/bench/corosio/socket_throughput_bench.cpp
@@ -7,6 +7,8 @@
 // Official repository: https://github.com/cppalliance/corosio
 //
 
+#include "benchmarks.hpp"
+
 #include <boost/corosio/io_context.hpp>
 #include <boost/corosio/detail/platform.hpp>
 #include <boost/corosio/tcp_socket.hpp>
@@ -28,58 +30,52 @@
 #include <sys/socket.h>
 #endif
 
-#include "../common/backend_selection.hpp"
 #include "../common/benchmark.hpp"
 
 namespace corosio = boost::corosio;
 namespace capy = boost::capy;
 
-// Helper to set TCP_NODELAY on a socket for low latency
-inline void set_nodelay(corosio::tcp_socket& s)
+namespace corosio_bench {
+namespace {
+
+// Disables Nagle's algorithm on s. Goes through tcp_socket::set_no_delay, the
+// same call the socket latency benchmarks use, rather than a hand-written
+// per-platform ::setsockopt whose return value was ignored.
+inline void set_nodelay( corosio::tcp_socket& s )
 {
-    int flag = 1;
-#if BOOST_COROSIO_HAS_IOCP
-    ::setsockopt(static_cast<SOCKET>(s.native_handle()), IPPROTO_TCP, TCP_NODELAY,
-                 reinterpret_cast<const char*>(&flag), sizeof(flag));
-#else
-    ::setsockopt(s.native_handle(), IPPROTO_TCP, TCP_NODELAY, &flag, sizeof(flag));
-#endif
+    // NOTE(review): assumes set_no_delay( true ) sets TCP_NODELAY just as the
+    // removed ::setsockopt calls did and reports failure acceptably -- confirm
+    // against tcp_socket before merging.
+    s.set_no_delay( true );
 }
 
-// Measures maximum unidirectional data transfer rate over a loopback socket pair.
-// One coroutine writes while another reads, testing the efficiency of async I/O
-// operations. Runs with different buffer sizes to reveal the optimal chunk size
-// for this platform. Small buffers stress syscall overhead; large buffers approach
-// memory bandwidth limits. Useful for tuning buffer sizes in streaming protocols.
 template<typename Context>
-bench::benchmark_result bench_throughput(std::size_t chunk_size, std::size_t total_bytes)
+bench::benchmark_result bench_throughput( std::size_t chunk_size, std::size_t total_bytes )
 {
     std::cout << "  Buffer size: " << chunk_size << " bytes, ";
-    std::cout << "Transfer: " << (total_bytes / (1024 * 1024)) << " MB\n";
+    std::cout << "Transfer: " << ( total_bytes / ( 1024 * 1024 ) ) << " MB\n";
 
     Context ioc;
-    auto [writer, reader] = corosio::test::make_socket_pair(ioc);
+    auto [writer, reader] = corosio::test::make_socket_pair( ioc );
 
-    // Disable Nagle's algorithm for fair comparison with Asio
-    set_nodelay(writer);
-    set_nodelay(reader);
+    set_nodelay( writer );
+    set_nodelay( reader );
 
-    std::vector<char> write_buf(chunk_size, 'x');
-    std::vector<char> read_buf(chunk_size);
+    std::vector<char> write_buf( chunk_size, 'x' );
+    std::vector<char> read_buf( chunk_size );
 
     std::size_t total_written = 0;
     std::size_t total_read = 0;
     bool writer_done = false;
 
-    // Writer coroutine
     auto write_task = [&]() -> capy::task<>
     {
-        while (total_written < total_bytes)
+        while( total_written < total_bytes )
         {
-            std::size_t to_write = (std::min)(chunk_size, total_bytes - total_written);
+            std::size_t to_write = ( std::min )( chunk_size, total_bytes - total_written );
             auto [ec, n] = co_await writer.write_some(
-                capy::const_buffer(write_buf.data(), to_write));
-            if (ec)
+                capy::const_buffer( write_buf.data(), to_write ) );
+            if( ec )
             {
                 std::cerr << "    Write error: " << ec.message() << "\n";
                 break;
@@ -87,24 +83,23 @@ bench::benchmark_result bench_throughput(std::size_t chunk_size, std::size_t tot
             total_written += n;
         }
         writer_done = true;
-        writer.shutdown(corosio::tcp_socket::shutdown_send);
+        writer.shutdown( corosio::tcp_socket::shutdown_send );
     };
 
-    // Reader coroutine
     auto read_task = [&]() -> capy::task<>
     {
-        while (total_read < total_bytes)
+        while( total_read < total_bytes )
         {
             auto [ec, n] = co_await reader.read_some(
-                capy::mutable_buffer(read_buf.data(), read_buf.size()));
-            if (ec)
+                capy::mutable_buffer( read_buf.data(), read_buf.size() ) );
+            if( ec )
             {
-                if (writer_done && total_read >= total_bytes)
+                if( writer_done && total_read >= total_bytes )
                     break;
                 std::cerr << "    Read error: " << ec.message() << "\n";
                 break;
             }
-            if (n == 0)
+            if( n == 0 )
                 break;
             total_read += n;
         }
@@ -112,271 +107,173 @@ bench::benchmark_result bench_throughput(std::size_t chunk_size, std::size_t tot
 
     bench::stopwatch sw;
 
-    capy::run_async(ioc.get_executor())(write_task());
-    capy::run_async(ioc.get_executor())(read_task());
+    capy::run_async( ioc.get_executor() )( write_task() );
+    capy::run_async( ioc.get_executor() )( read_task() );
     ioc.run();
 
     double elapsed = sw.elapsed_seconds();
-    double throughput = static_cast<double>(total_read) / elapsed;
+    double throughput = static_cast<double>( total_read ) / elapsed;
 
     std::cout << "    Written:    " << total_written << " bytes\n";
     std::cout << "    Read:       " << total_read << " bytes\n";
-    std::cout << "    Elapsed:    " << std::fixed << std::setprecision(3)
+    std::cout << "    Elapsed:    " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "    Throughput: " << bench::format_throughput(throughput) << "\n\n";
+    std::cout << "    Throughput: " << bench::format_throughput( throughput ) << "\n\n";
 
     writer.close();
     reader.close();
 
-    return bench::benchmark_result("throughput_" + std::to_string(chunk_size))
-        .add("chunk_size", static_cast<double>(chunk_size))
-        .add("total_bytes", static_cast<double>(total_bytes))
-        .add("bytes_written", static_cast<double>(total_written))
-        .add("bytes_read", static_cast<double>(total_read))
-        .add("elapsed_s", elapsed)
-        .add("throughput_bytes_per_sec", throughput);
+    return bench::benchmark_result( "throughput_" + std::to_string( chunk_size ) )
+        .add( "chunk_size", static_cast<double>( chunk_size ) )
+        .add( "total_bytes", static_cast<double>( total_bytes ) )
+        .add( "bytes_written", static_cast<double>( total_written ) )
+        .add( "bytes_read", static_cast<double>( total_read ) )
+        .add( "elapsed_s", elapsed )
+        .add( "throughput_bytes_per_sec", throughput );
 }
 
-// Measures full-duplex throughput with both endpoints sending and receiving
-// simultaneously. Four concurrent coroutines (two writers, two readers) stress
-// the scheduler's ability to multiplex I/O efficiently. This pattern is common
-// in protocols like WebSocket or gRPC where data flows in both directions.
-// Combined throughput should ideally approach 2x unidirectional throughput.
 template<typename Context>
-bench::benchmark_result bench_bidirectional_throughput(std::size_t chunk_size, std::size_t total_bytes)
+bench::benchmark_result bench_bidirectional_throughput( std::size_t chunk_size, std::size_t total_bytes )
 {
     std::cout << "  Buffer size: " << chunk_size << " bytes, ";
-    std::cout << "Transfer: " << (total_bytes / (1024 * 1024)) << " MB each direction\n";
+    std::cout << "Transfer: " << ( total_bytes / ( 1024 * 1024 ) ) << " MB each direction\n";
 
     Context ioc;
-    auto [sock1, sock2] = corosio::test::make_socket_pair(ioc);
+    auto [sock1, sock2] = corosio::test::make_socket_pair( ioc );
 
-    // Disable Nagle's algorithm for fair comparison with Asio
-    set_nodelay(sock1);
-    set_nodelay(sock2);
+    set_nodelay( sock1 );
+    set_nodelay( sock2 );
 
-    std::vector<char> buf1(chunk_size, 'a');
-    std::vector<char> buf2(chunk_size, 'b');
+    std::vector<char> buf1( chunk_size, 'a' );
+    std::vector<char> buf2( chunk_size, 'b' );
 
     std::size_t written1 = 0, read1 = 0;
     std::size_t written2 = 0, read2 = 0;
 
-    // Socket 1 writes to socket 2
     auto write1_task = [&]() -> capy::task<>
     {
-        while (written1 < total_bytes)
+        while( written1 < total_bytes )
         {
-            std::size_t to_write = (std::min)(chunk_size, total_bytes - written1);
+            std::size_t to_write = ( std::min )( chunk_size, total_bytes - written1 );
             auto [ec, n] = co_await sock1.write_some(
-                capy::const_buffer(buf1.data(), to_write));
-            if (ec) break;
+                capy::const_buffer( buf1.data(), to_write ) );
+            if( ec ) break;
             written1 += n;
         }
-        sock1.shutdown(corosio::tcp_socket::shutdown_send);
+        sock1.shutdown( corosio::tcp_socket::shutdown_send );
     };
 
-    // Socket 2 reads from socket 1
     auto read1_task = [&]() -> capy::task<>
     {
-        std::vector<char> rbuf(chunk_size);
-        while (read1 < total_bytes)
+        std::vector<char> rbuf( chunk_size );
+        while( read1 < total_bytes )
         {
             auto [ec, n] = co_await sock2.read_some(
-                capy::mutable_buffer(rbuf.data(), rbuf.size()));
-            if (ec || n == 0) break;
+                capy::mutable_buffer( rbuf.data(), rbuf.size() ) );
+            if( ec || n == 0 ) break;
             read1 += n;
         }
     };
 
-    // Socket 2 writes to socket 1
     auto write2_task = [&]() -> capy::task<>
     {
-        while (written2 < total_bytes)
+        while( written2 < total_bytes )
         {
-            std::size_t to_write = (std::min)(chunk_size, total_bytes - written2);
+            std::size_t to_write = ( std::min )( chunk_size, total_bytes - written2 );
             auto [ec, n] = co_await sock2.write_some(
-                capy::const_buffer(buf2.data(), to_write));
-            if (ec) break;
+                capy::const_buffer( buf2.data(), to_write ) );
+            if( ec ) break;
             written2 += n;
         }
-        sock2.shutdown(corosio::tcp_socket::shutdown_send);
+        sock2.shutdown( corosio::tcp_socket::shutdown_send );
     };
 
-    // Socket 1 reads from socket 2
     auto read2_task = [&]() -> capy::task<>
     {
-        std::vector<char> rbuf(chunk_size);
-        while (read2 < total_bytes)
+        std::vector<char> rbuf( chunk_size );
+        while( read2 < total_bytes )
         {
             auto [ec, n] = co_await sock1.read_some(
-                capy::mutable_buffer(rbuf.data(), rbuf.size()));
-            if (ec || n == 0) break;
+                capy::mutable_buffer( rbuf.data(), rbuf.size() ) );
+            if( ec || n == 0 ) break;
             read2 += n;
         }
     };
 
     bench::stopwatch sw;
 
-    capy::run_async(ioc.get_executor())(write1_task());
-    capy::run_async(ioc.get_executor())(read1_task());
-    capy::run_async(ioc.get_executor())(write2_task());
-    capy::run_async(ioc.get_executor())(read2_task());
+    capy::run_async( ioc.get_executor() )( write1_task() );
+    capy::run_async( ioc.get_executor() )( read1_task() );
+    capy::run_async( ioc.get_executor() )( write2_task() );
+    capy::run_async( ioc.get_executor() )( read2_task() );
     ioc.run();
 
     double elapsed = sw.elapsed_seconds();
     std::size_t total_transferred = read1 + read2;
-    double throughput = static_cast<double>(total_transferred) / elapsed;
+    double throughput = static_cast<double>( total_transferred ) / elapsed;
 
     std::cout << "    Direction 1: " << read1 << " bytes\n";
     std::cout << "    Direction 2: " << read2 << " bytes\n";
     std::cout << "    Total:       " << total_transferred << " bytes\n";
-    std::cout << "    Elapsed:     " << std::fixed << std::setprecision(3)
+    std::cout << "    Elapsed:     " << std::fixed << std::setprecision( 3 )
               << elapsed << " s\n";
-    std::cout << "    Throughput:  " << bench::format_throughput(throughput)
+    std::cout << "    Throughput:  " << bench::format_throughput( throughput )
               << " (combined)\n\n";
 
     sock1.close();
     sock2.close();
 
-    return bench::benchmark_result("bidirectional_" + std::to_string(chunk_size))
-        .add("chunk_size", static_cast<double>(chunk_size))
-        .add("total_bytes_per_direction", static_cast<double>(total_bytes))
-        .add("bytes_direction1", static_cast<double>(read1))
-        .add("bytes_direction2", static_cast<double>(read2))
-        .add("total_transferred", static_cast<double>(total_transferred))
-        .add("elapsed_s", elapsed)
-        .add("throughput_bytes_per_sec", throughput);
+    return bench::benchmark_result( "bidirectional_" + std::to_string( chunk_size ) )
+        .add( "chunk_size", static_cast<double>( chunk_size ) )
+        .add( "total_bytes_per_direction", static_cast<double>( total_bytes ) )
+        .add( "bytes_direction1", static_cast<double>( read1 ) )
+        .add( "bytes_direction2", static_cast<double>( read2 ) )
+        .add( "total_transferred", static_cast<double>( total_transferred ) )
+        .add( "elapsed_s", elapsed )
+        .add( "throughput_bytes_per_sec", throughput );
 }
 
-// Run benchmarks for a specific context type
+} // anonymous namespace
+
 template<typename Context>
-void run_benchmarks(const char* backend_name, const char* output_file, const char* bench_filter)
+void run_socket_throughput_benchmarks(
+    bench::result_collector& collector,
+    char const* filter )
 {
-    std::cout << "Boost.Corosio Socket Throughput Benchmarks\n";
-    std::cout << "==========================================\n";
-    std::cout << "Backend: " << backend_name << "\n\n";
+    std::cout << "\n>>> Socket Throughput Benchmarks <<<\n";
 
-    bench::result_collector collector(backend_name);
+    bool run_all = !filter || std::strcmp( filter, "all" ) == 0;
 
-    bool run_all = !bench_filter || std::strcmp(bench_filter, "all") == 0;
-
-    // Variable buffer sizes
-    std::vector<std::size_t> buffer_sizes = {1024, 4096, 16384, 65536};
-    std::size_t transfer_size = 64 * 1024 * 1024; // 64 MB
-
-    if (run_all || std::strcmp(bench_filter, "unidirectional") == 0)
-    {
-        bench::print_header("Unidirectional Throughput");
-        for (auto size : buffer_sizes)
-            collector.add(bench_throughput<Context>(size, transfer_size));
-    }
+    std::vector<std::size_t> buffer_sizes = { 1024, 4096, 16384, 65536 };
+    std::size_t transfer_size = 64 * 1024 * 1024;
 
-    if (run_all || std::strcmp(bench_filter, "bidirectional") == 0)
+    if( run_all || std::strcmp( filter, "unidirectional" ) == 0 )
     {
-        bench::print_header("Bidirectional Throughput");
-        for (auto size : buffer_sizes)
-            collector.add(bench_bidirectional_throughput<Context>(size, transfer_size / 2));
+        bench::print_header( "Unidirectional Throughput" );
+        for( auto size : buffer_sizes )
+            collector.add( bench_throughput<Context>( size, transfer_size ) );
     }
 
-    std::cout << "\nBenchmarks complete.\n";
-
-    if (output_file)
+    if( run_all || std::strcmp( filter, "bidirectional" ) == 0 )
     {
-        if (collector.write_json(output_file))
-            std::cout << "Results written to: " << output_file << "\n";
-        else
-            std::cerr << "Error: Failed to write results to: " << output_file << "\n";
+        bench::print_header( "Bidirectional Throughput" );
+        for( auto size : buffer_sizes )
+            collector.add( bench_bidirectional_throughput<Context>( size, transfer_size / 2 ) );
     }
 }
 
-void print_usage(const char* program_name)
-{
-    std::cout << "Usage: " << program_name << " [OPTIONS]\n\n";
-    std::cout << "Options:\n";
-    std::cout << "  --backend <name>   Select I/O backend (default: platform default)\n";
-    std::cout << "  --bench <name>     Run only the specified benchmark\n";
-    std::cout << "  --output <file>    Write JSON results to file\n";
-    std::cout << "  --list             List available backends\n";
-    std::cout << "  --help             Show this help message\n";
-    std::cout << "\n";
-    std::cout << "Available benchmarks:\n";
-    std::cout << "  unidirectional     Unidirectional throughput (various buffer sizes)\n";
-    std::cout << "  bidirectional      Bidirectional throughput (various buffer sizes)\n";
-    std::cout << "  all                Run all benchmarks (default)\n";
-    std::cout << "\n";
-    bench::print_available_backends();
-}
-
-int main(int argc, char* argv[])
-{
-    const char* backend = nullptr;
-    const char* output_file = nullptr;
-    const char* bench_filter = nullptr;
-
-    for (int i = 1; i < argc; ++i)
-    {
-        if (std::strcmp(argv[i], "--backend") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                backend = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --backend requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--bench") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                bench_filter = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --bench requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--output") == 0)
-        {
-            if (i + 1 < argc)
-            {
-                output_file = argv[++i];
-            }
-            else
-            {
-                std::cerr << "Error: --output requires an argument\n";
-                return 1;
-            }
-        }
-        else if (std::strcmp(argv[i], "--list") == 0)
-        {
-            bench::print_available_backends();
-            return 0;
-        }
-        else if (std::strcmp(argv[i], "--help") == 0 || std::strcmp(argv[i], "-h") == 0)
-        {
-            print_usage(argv[0]);
-            return 0;
-        }
-        else
-        {
-            std::cerr << "Unknown option: " << argv[i] << "\n";
-            print_usage(argv[0]);
-            return 1;
-        }
-    }
-
-    // If no backend specified, use platform default
-    if (!backend)
-        backend = bench::default_backend_name();
+// Explicit instantiations
+#if BOOST_COROSIO_HAS_EPOLL
+template void run_socket_throughput_benchmarks<corosio::epoll_context>(
+    bench::result_collector&, char const* );
+#endif
+#if BOOST_COROSIO_HAS_SELECT
+template void run_socket_throughput_benchmarks<corosio::select_context>(
+    bench::result_collector&, char const* );
+#endif
+#if BOOST_COROSIO_HAS_IOCP
+template void run_socket_throughput_benchmarks<corosio::iocp_context>(
+    bench::result_collector&, char const* );
+#endif
 
-    // Dispatch to the selected backend using a generic lambda
-    return bench::dispatch_backend(backend,
-        [=]<typename Context>(const char* name)
-        {
-            run_benchmarks<Context>(name, output_file, bench_filter);
-        });
-}
+} // namespace corosio_bench

From b2c75b11fce04d57858f3789c626ab594920a4dd Mon Sep 17 00:00:00 2001
From: Steve Gerbino <steve@gerbino.co>
Date: Wed, 4 Feb 2026 15:03:20 +0100
Subject: [PATCH 2/2] Add warmup and increase benchmark iterations

- Add warmup phase to the socket latency, socket throughput, and HTTP server
  benchmarks to reduce variance (the io_context benchmarks already had one)
- Remove category headers from output
- Increase iterations for more stable results:
  - io_context: 5M handlers (was 1M)
  - socket_latency: 1M iterations (was 1K)
  - socket_throughput: 4 GiB transfer (was 64 MiB)
  - http_server: 1M requests (was 10K)
---
 bench/asio/http_server_bench.cpp          | 36 ++++++++++++------
 bench/asio/io_context_bench.cpp           | 10 ++---
 bench/asio/socket_latency_bench.cpp       | 24 +++++++++---
 bench/asio/socket_throughput_bench.cpp    | 15 ++++++--
 bench/corosio/http_server_bench.cpp       | 45 +++++++++++++++++------
 bench/corosio/io_context_bench.cpp        | 10 ++---
 bench/corosio/socket_latency_bench.cpp    | 29 ++++++++++++---
 bench/corosio/socket_throughput_bench.cpp | 20 ++++++++--
 8 files changed, 137 insertions(+), 52 deletions(-)

diff --git a/bench/asio/http_server_bench.cpp b/bench/asio/http_server_bench.cpp
index 17da83f1..34488648 100644
--- a/bench/asio/http_server_bench.cpp
+++ b/bench/asio/http_server_bench.cpp
@@ -320,14 +320,28 @@ void run_http_server_benchmarks(
     bench::result_collector& collector,
     char const* filter )
 {
-    std::cout << "\n>>> HTTP Server Benchmarks (Asio) <<<\n";
-
     bool run_all = !filter || std::strcmp( filter, "all" ) == 0;
 
+    // Warm up
+    {
+        asio::io_context ioc;
+        auto [c, s] = make_socket_pair( ioc );
+        char buf[256] = {};
+        for( int i = 0; i < 10; ++i )
+        {
+            asio::write( c, asio::buffer( bench::http::small_request, bench::http::small_request_size ) );
+            asio::read( s, asio::buffer( buf, bench::http::small_request_size ) );
+            asio::write( s, asio::buffer( bench::http::small_response, bench::http::small_response_size ) );
+            asio::read( c, asio::buffer( buf, bench::http::small_response_size ) );
+        }
+        c.close();
+        s.close();
+    }
+
     if( run_all || std::strcmp( filter, "single_conn" ) == 0 )
     {
         bench::print_header( "Single Connection (Sequential Requests)" );
-        collector.add( bench_single_connection( 10000 ) );
+        collector.add( bench_single_connection( 1000000 ) );
     }
 
     if( run_all || std::strcmp( filter, "concurrent" ) == 0 )
@@ -335,10 +349,10 @@ void run_http_server_benchmarks(
         if( run_all )
             std::this_thread::sleep_for( std::chrono::seconds( 5 ) );
         bench::print_header( "Concurrent Connections" );
-        collector.add( bench_concurrent_connections( 1, 10000 ) );
-        collector.add( bench_concurrent_connections( 4, 2500 ) );
-        collector.add( bench_concurrent_connections( 16, 625 ) );
-        collector.add( bench_concurrent_connections( 32, 312 ) );
+        collector.add( bench_concurrent_connections( 1, 1000000 ) );
+        collector.add( bench_concurrent_connections( 4, 250000 ) );
+        collector.add( bench_concurrent_connections( 16, 62500 ) );
+        collector.add( bench_concurrent_connections( 32, 31250 ) );
     }
 
     if( run_all || std::strcmp( filter, "multithread" ) == 0 )
@@ -346,10 +360,10 @@ void run_http_server_benchmarks(
         if( run_all )
             std::this_thread::sleep_for( std::chrono::seconds( 5 ) );
         bench::print_header( "Multi-threaded (32 connections, varying threads)" );
-        collector.add( bench_multithread( 1, 32, 312 ) );
-        collector.add( bench_multithread( 2, 32, 312 ) );
-        collector.add( bench_multithread( 4, 32, 312 ) );
-        collector.add( bench_multithread( 8, 32, 312 ) );
+        collector.add( bench_multithread( 1, 32, 31250 ) );
+        collector.add( bench_multithread( 2, 32, 31250 ) );
+        collector.add( bench_multithread( 4, 32, 31250 ) );
+        collector.add( bench_multithread( 8, 32, 31250 ) );
     }
 }
 
diff --git a/bench/asio/io_context_bench.cpp b/bench/asio/io_context_bench.cpp
index 987768bd..0f641708 100644
--- a/bench/asio/io_context_bench.cpp
+++ b/bench/asio/io_context_bench.cpp
@@ -227,8 +227,6 @@ void run_io_context_benchmarks(
     bench::result_collector& collector,
     char const* filter )
 {
-    std::cout << "\n>>> io_context Benchmarks (Asio) <<<\n";
-
     bool run_all = !filter || std::strcmp( filter, "all" ) == 0;
 
     // Warm up
@@ -241,16 +239,16 @@ void run_io_context_benchmarks(
     }
 
     if( run_all || std::strcmp( filter, "single_threaded" ) == 0 )
-        collector.add( bench_single_threaded_post( 1000000 ) );
+        collector.add( bench_single_threaded_post( 5000000 ) );
 
     if( run_all || std::strcmp( filter, "multithreaded" ) == 0 )
-        collector.add( bench_multithreaded_scaling( 1000000, 8 ) );
+        collector.add( bench_multithreaded_scaling( 5000000, 8 ) );
 
     if( run_all || std::strcmp( filter, "interleaved" ) == 0 )
-        collector.add( bench_interleaved_post_run( 10000, 100 ) );
+        collector.add( bench_interleaved_post_run( 50000, 100 ) );
 
     if( run_all || std::strcmp( filter, "concurrent" ) == 0 )
-        collector.add( bench_concurrent_post_run( 4, 250000 ) );
+        collector.add( bench_concurrent_post_run( 4, 1250000 ) );
 }
 
 } // namespace asio_bench
diff --git a/bench/asio/socket_latency_bench.cpp b/bench/asio/socket_latency_bench.cpp
index a8a73453..5a686925 100644
--- a/bench/asio/socket_latency_bench.cpp
+++ b/bench/asio/socket_latency_bench.cpp
@@ -170,12 +170,24 @@ void run_socket_latency_benchmarks(
     bench::result_collector& collector,
     char const* filter )
 {
-    std::cout << "\n>>> Socket Latency Benchmarks (Asio) <<<\n";
-
     bool run_all = !filter || std::strcmp( filter, "all" ) == 0;
 
+    // Warm up
+    {
+        asio::io_context ioc;
+        auto [c, s] = make_socket_pair( ioc );
+        char buf[64] = {};
+        for( int i = 0; i < 100; ++i )
+        {
+            asio::write( c, asio::buffer( buf ) );
+            asio::read( s, asio::buffer( buf ) );
+        }
+        c.close();
+        s.close();
+    }
+
     std::vector<std::size_t> message_sizes = { 1, 64, 1024 };
-    int iterations = 1000;
+    int iterations = 1000000;
 
     if( run_all || std::strcmp( filter, "pingpong" ) == 0 )
     {
@@ -187,9 +199,9 @@ void run_socket_latency_benchmarks(
     if( run_all || std::strcmp( filter, "concurrent" ) == 0 )
     {
         bench::print_header( "Concurrent Socket Pairs Latency (Asio)" );
-        collector.add( bench_concurrent_latency( 1, 64, 1000 ) );
-        collector.add( bench_concurrent_latency( 4, 64, 500 ) );
-        collector.add( bench_concurrent_latency( 16, 64, 250 ) );
+        collector.add( bench_concurrent_latency( 1, 64, 1000000 ) );
+        collector.add( bench_concurrent_latency( 4, 64, 500000 ) );
+        collector.add( bench_concurrent_latency( 16, 64, 250000 ) );
     }
 }
 
diff --git a/bench/asio/socket_throughput_bench.cpp b/bench/asio/socket_throughput_bench.cpp
index c2c51df3..f2b6f577 100644
--- a/bench/asio/socket_throughput_bench.cpp
+++ b/bench/asio/socket_throughput_bench.cpp
@@ -223,12 +223,21 @@ void run_socket_throughput_benchmarks(
     bench::result_collector& collector,
     char const* filter )
 {
-    std::cout << "\n>>> Socket Throughput Benchmarks (Asio) <<<\n";
-
     bool run_all = !filter || std::strcmp( filter, "all" ) == 0;
 
+    // Warm up
+    {
+        asio::io_context ioc;
+        auto [w, r] = make_socket_pair( ioc );
+        std::vector<char> buf( 4096, 'w' );
+        asio::write( w, asio::buffer( buf ) );
+        asio::read( r, asio::buffer( buf ) );
+        w.close();
+        r.close();
+    }
+
     std::vector<std::size_t> buffer_sizes = { 1024, 4096, 16384, 65536 };
-    std::size_t transfer_size = 64 * 1024 * 1024;
+    std::size_t transfer_size = 4ULL * 1024 * 1024 * 1024;
 
     if( run_all || std::strcmp( filter, "unidirectional" ) == 0 )
     {
diff --git a/bench/corosio/http_server_bench.cpp b/bench/corosio/http_server_bench.cpp
index 8f719c7e..4a642786 100644
--- a/bench/corosio/http_server_bench.cpp
+++ b/bench/corosio/http_server_bench.cpp
@@ -320,14 +320,37 @@ void run_http_server_benchmarks(
     bench::result_collector& collector,
     char const* filter )
 {
-    std::cout << "\n>>> HTTP Server Benchmarks <<<\n";
-
     bool run_all = !filter || std::strcmp( filter, "all" ) == 0;
 
+    // Warm up
+    {
+        Context ioc;
+        auto [c, s] = corosio::test::make_socket_pair( ioc );
+        char buf[256] = {};
+        auto task = [&]() -> capy::task<>
+        {
+            for( int i = 0; i < 10; ++i )
+            {
+                (void)co_await capy::write(
+                    c, capy::const_buffer( bench::http::small_request, bench::http::small_request_size ) );
+                (void)co_await s.read_some(
+                    capy::mutable_buffer( buf, bench::http::small_request_size ) );
+                (void)co_await capy::write(
+                    s, capy::const_buffer( bench::http::small_response, bench::http::small_response_size ) );
+                (void)co_await c.read_some(
+                    capy::mutable_buffer( buf, bench::http::small_response_size ) );
+            }
+        };
+        capy::run_async( ioc.get_executor() )( task() );
+        ioc.run();
+        c.close();
+        s.close();
+    }
+
     if( run_all || std::strcmp( filter, "single_conn" ) == 0 )
     {
         bench::print_header( "Single Connection (Sequential Requests)" );
-        collector.add( bench_single_connection<Context>( 10000 ) );
+        collector.add( bench_single_connection<Context>( 1000000 ) );
     }
 
     if( run_all || std::strcmp( filter, "concurrent" ) == 0 )
@@ -335,10 +358,10 @@ void run_http_server_benchmarks(
         if( run_all )
             std::this_thread::sleep_for( std::chrono::seconds( 5 ) );
         bench::print_header( "Concurrent Connections" );
-        collector.add( bench_concurrent_connections<Context>( 1, 10000 ) );
-        collector.add( bench_concurrent_connections<Context>( 4, 2500 ) );
-        collector.add( bench_concurrent_connections<Context>( 16, 625 ) );
-        collector.add( bench_concurrent_connections<Context>( 32, 312 ) );
+        collector.add( bench_concurrent_connections<Context>( 1, 1000000 ) );
+        collector.add( bench_concurrent_connections<Context>( 4, 250000 ) );
+        collector.add( bench_concurrent_connections<Context>( 16, 62500 ) );
+        collector.add( bench_concurrent_connections<Context>( 32, 31250 ) );
     }
 
     if( run_all || std::strcmp( filter, "multithread" ) == 0 )
@@ -346,10 +369,10 @@ void run_http_server_benchmarks(
         if( run_all )
             std::this_thread::sleep_for( std::chrono::seconds( 5 ) );
         bench::print_header( "Multi-threaded (32 connections, varying threads)" );
-        collector.add( bench_multithread<Context>( 1, 32, 312 ) );
-        collector.add( bench_multithread<Context>( 2, 32, 312 ) );
-        collector.add( bench_multithread<Context>( 4, 32, 312 ) );
-        collector.add( bench_multithread<Context>( 8, 32, 312 ) );
+        collector.add( bench_multithread<Context>( 1, 32, 31250 ) );
+        collector.add( bench_multithread<Context>( 2, 32, 31250 ) );
+        collector.add( bench_multithread<Context>( 4, 32, 31250 ) );
+        collector.add( bench_multithread<Context>( 8, 32, 31250 ) );
     }
 }
 
diff --git a/bench/corosio/io_context_bench.cpp b/bench/corosio/io_context_bench.cpp
index 57d033d8..b097761e 100644
--- a/bench/corosio/io_context_bench.cpp
+++ b/bench/corosio/io_context_bench.cpp
@@ -234,8 +234,6 @@ void run_io_context_benchmarks(
     bench::result_collector& collector,
     char const* filter )
 {
-    std::cout << "\n>>> io_context Benchmarks <<<\n";
-
     bool run_all = !filter || std::strcmp( filter, "all" ) == 0;
 
     // Warm up
@@ -249,16 +247,16 @@ void run_io_context_benchmarks(
     }
 
     if( run_all || std::strcmp( filter, "single_threaded" ) == 0 )
-        collector.add( bench_single_threaded_post<Context>( 1000000 ) );
+        collector.add( bench_single_threaded_post<Context>( 5000000 ) );
 
     if( run_all || std::strcmp( filter, "multithreaded" ) == 0 )
-        collector.add( bench_multithreaded_scaling<Context>( 1000000, 8 ) );
+        collector.add( bench_multithreaded_scaling<Context>( 5000000, 8 ) );
 
     if( run_all || std::strcmp( filter, "interleaved" ) == 0 )
-        collector.add( bench_interleaved_post_run<Context>( 10000, 100 ) );
+        collector.add( bench_interleaved_post_run<Context>( 50000, 100 ) );
 
     if( run_all || std::strcmp( filter, "concurrent" ) == 0 )
-        collector.add( bench_concurrent_post_run<Context>( 4, 250000 ) );
+        collector.add( bench_concurrent_post_run<Context>( 4, 1250000 ) );
 }
 
 // Explicit instantiations
diff --git a/bench/corosio/socket_latency_bench.cpp b/bench/corosio/socket_latency_bench.cpp
index 9ecd0c84..c2312aad 100644
--- a/bench/corosio/socket_latency_bench.cpp
+++ b/bench/corosio/socket_latency_bench.cpp
@@ -188,12 +188,29 @@ void run_socket_latency_benchmarks(
     bench::result_collector& collector,
     char const* filter )
 {
-    std::cout << "\n>>> Socket Latency Benchmarks <<<\n";
-
     bool run_all = !filter || std::strcmp( filter, "all" ) == 0;
 
+    // Warm up
+    {
+        Context ioc;
+        auto [c, s] = corosio::test::make_socket_pair( ioc );
+        char buf[64] = {};
+        auto task = [&]() -> capy::task<>
+        {
+            for( int i = 0; i < 100; ++i )
+            {
+                (void)co_await c.write_some( capy::const_buffer( buf, sizeof( buf ) ) );
+                (void)co_await s.read_some( capy::mutable_buffer( buf, sizeof( buf ) ) );
+            }
+        };
+        capy::run_async( ioc.get_executor() )( task() );
+        ioc.run();
+        c.close();
+        s.close();
+    }
+
     std::vector<std::size_t> message_sizes = { 1, 64, 1024 };
-    int iterations = 1000;
+    int iterations = 1000000;
 
     if( run_all || std::strcmp( filter, "pingpong" ) == 0 )
     {
@@ -205,9 +222,9 @@ void run_socket_latency_benchmarks(
     if( run_all || std::strcmp( filter, "concurrent" ) == 0 )
     {
         bench::print_header( "Concurrent Socket Pairs Latency" );
-        collector.add( bench_concurrent_latency<Context>( 1, 64, 1000 ) );
-        collector.add( bench_concurrent_latency<Context>( 4, 64, 500 ) );
-        collector.add( bench_concurrent_latency<Context>( 16, 64, 250 ) );
+        collector.add( bench_concurrent_latency<Context>( 1, 64, 1000000 ) );
+        collector.add( bench_concurrent_latency<Context>( 4, 64, 500000 ) );
+        collector.add( bench_concurrent_latency<Context>( 16, 64, 250000 ) );
     }
 }
 
diff --git a/bench/corosio/socket_throughput_bench.cpp b/bench/corosio/socket_throughput_bench.cpp
index 919a3535..748859d4 100644
--- a/bench/corosio/socket_throughput_bench.cpp
+++ b/bench/corosio/socket_throughput_bench.cpp
@@ -240,12 +240,26 @@ void run_socket_throughput_benchmarks(
     bench::result_collector& collector,
     char const* filter )
 {
-    std::cout << "\n>>> Socket Throughput Benchmarks <<<\n";
-
     bool run_all = !filter || std::strcmp( filter, "all" ) == 0;
 
+    // Warm up
+    {
+        Context ioc;
+        auto [w, r] = corosio::test::make_socket_pair( ioc );
+        std::vector<char> buf( 4096, 'w' );
+        auto task = [&]() -> capy::task<>
+        {
+            (void)co_await w.write_some( capy::const_buffer( buf.data(), buf.size() ) );
+            (void)co_await r.read_some( capy::mutable_buffer( buf.data(), buf.size() ) );
+        };
+        capy::run_async( ioc.get_executor() )( task() );
+        ioc.run();
+        w.close();
+        r.close();
+    }
+
     std::vector<std::size_t> buffer_sizes = { 1024, 4096, 16384, 65536 };
-    std::size_t transfer_size = 64 * 1024 * 1024;
+    std::size_t transfer_size = 4ULL * 1024 * 1024 * 1024;
 
     if( run_all || std::strcmp( filter, "unidirectional" ) == 0 )
     {