#!/usr/bin/env python3
"""Always-200 trace sink bound to 127.0.0.1:8126 for test_extension_ci.

Most of the extension test suite runs with the ddtrace default agent endpoint
(DD_AGENT_HOST=localhost, DD_TRACE_AGENT_PORT=8126), which has no listener in the
CI container -- the real services (request-replayer, test-agent) run as separate
containers on their own hostnames. Those sidecar trace sends therefore get
ECONNREFUSED and retry (max_retries=5, exponential backoff). Because the sidecar's
trace flusher is shared across all sessions, those doomed sends stall the
synchronous flush of unrelated tests past the 10s IPC read timeout, producing the
flaky "Failed synchronously flushing traces: Kind(TimedOut)" /
"wait for replay timeout" failures (e.g. client_side_stats).

This sink answers every request instantly with 200 and an empty JSON body, so those
default sends succeed immediately instead of retry-storming and never clog the shared
flusher. An instant in-process responder (vs redirecting traffic at a real agent)
cannot itself become the bottleneck.

It deliberately changes no DD_* environment variable, so config-observing tests
(telemetry/config, read_c_configuration) and the RequestReplayer test helper -- which
read DD_AGENT_HOST / DD_TRACE_AGENT_PORT -- are unaffected.

Request bodies are drained (Content-Length or chunked) before replying so that
keep-alive connections stay in sync; when a body cannot be drained reliably the
reply carries "Connection: close" instead of leaving a desynchronised stream open.
"""
# --- Provenance / CI wiring ------------------------------------------------------------
# This script was introduced together with the following change (kept here for context;
# original YAML indentation is not reproduced):
#
#   diff --git a/.gitlab/generate-tracer.php b/.gitlab/generate-tracer.php
#   index 3c9a3ac6d4..a2a89164bf 100644
#   @@ -367,6 +367,16 @@ function before_script_steps($with_docker_auth = false) {
#     ARCH: "amd64"
#     timeout: 120m
#     script:
#   + # Start an always-200 sink on the default agent endpoint (localhost:8126). The bulk of
#   + # the suite runs with the ddtrace default agent (localhost:8126), which has no listener
#   + # here -> those sidecar trace sends ECONNREFUSED and retry, clogging the shared sidecar
#   + # trace-flusher and stalling unrelated tests' synchronous flush past the 10s IPC timeout
#   + # (flaky client_side_stats "Kind(TimedOut)" / "wait for replay timeout"). The sink makes
#   + # those sends succeed instantly. It changes no DD_* env var, so config-observing tests
#   + # (telemetry/config, read_c_configuration) and the RequestReplayer helper are unaffected.
#   + # (make test_extension_ci copies ~hundreds of test files before any test runs, so the
#   + # sink's <100ms bind is comfortably ready before the first trace is sent.)
#   + - python3 .gitlab/scripts/localhost-trace-sink.py &
#     - make test_extension_ci
#
#   diff --git a/.gitlab/scripts/localhost-trace-sink.py b/.gitlab/scripts/localhost-trace-sink.py
#   new file mode 100644
#   index 0000000000..784c8dfacd
#   @@ -0,0 +1,55 @@
#
# NOTE(review): the CI step backgrounds this script with `&` and relies on timing for
# readiness; if that ever proves racy, wait for the port before `make test_extension_ci`.
# ---------------------------------------------------------------------------------------
import sys
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer

# NOTE(review): the sink listens on the IPv4 loopback only. If "localhost" resolves to
# ::1 first inside the CI container, clients must fall back to 127.0.0.1 -- confirm.
HOST = "127.0.0.1"
PORT = 8126

BODY = b"{}"  # empty rate_by_service / info; enough for the sidecar to treat the flush as succeeded

_READ_CHUNK = 64 * 1024  # discard bodies in bounded pieces instead of one big allocation
_MAX_LINE = 64 * 1024  # upper bound for a chunk-size / trailer line


class Handler(BaseHTTPRequestHandler):
    """Reply ``200`` with ``BODY`` to every request, whatever the method or path."""

    protocol_version = "HTTP/1.1"  # keep-alive so the sidecar can reuse connections

    # Headers and body are written as two small segments on an unbuffered socket; with
    # Nagle enabled the second one can wait for the peer's (delayed) ACK on a keep-alive
    # connection. TCP_NODELAY keeps the "instant" responder instant.
    disable_nagle_algorithm = True

    def _discard(self, remaining: int) -> bool:
        """Read and drop exactly *remaining* bytes; return False on early EOF."""
        while remaining > 0:
            piece = self.rfile.read(min(remaining, _READ_CHUNK))
            if not piece:
                return False
            remaining -= len(piece)
        return True

    def _drain_chunked(self) -> bool:
        """Consume a ``Transfer-Encoding: chunked`` body, including trailers.

        Returns True when the terminating zero-size chunk and the blank line after
        the trailers were seen, False when the framing is malformed or truncated.
        """
        while True:
            size_line = self.rfile.readline(_MAX_LINE + 1)
            if not size_line or len(size_line) > _MAX_LINE:
                return False
            try:
                # Chunk extensions (";name=value") follow the hexadecimal size.
                size = int(size_line.split(b";", 1)[0].strip(), 16)
            except ValueError:
                return False
            if size < 0:
                return False
            if size == 0:
                # Trailer section: skip lines until the terminating blank line.
                while True:
                    trailer = self.rfile.readline(_MAX_LINE + 1)
                    if not trailer:
                        return False
                    if trailer in (b"\r\n", b"\n"):
                        return True
            if not self._discard(size):
                return False
            # Every chunk's data is followed by its own line terminator.
            if self.rfile.readline(_MAX_LINE + 1) not in (b"\r\n", b"\n"):
                return False

    def _drain_body(self) -> bool:
        """Consume the request body so the next keep-alive request starts cleanly.

        Returns True when the stream is positioned right after the body, False when
        the body framing is unusable (bad or negative Content-Length, truncated or
        malformed chunked body) and the connection must not be reused.
        """
        transfer_encoding = (self.headers.get("Transfer-Encoding") or "").lower()
        if "chunked" in transfer_encoding:
            # Chunked framing takes precedence over any Content-Length header.
            return self._drain_chunked()

        raw_length = self.headers.get("Content-Length")
        if not raw_length:
            return True  # no body announced
        try:
            length = int(raw_length)
        except ValueError:
            return False
        if length < 0:
            # A negative size would make rfile.read() block until EOF.
            return False
        return self._discard(length)

    def _respond(self):
        """Drain the request body, then send the fixed 200 response.

        All I/O is best effort: a peer that disconnects mid-request must not produce
        a traceback, but the connection is then marked for closing instead of being
        reused in an unknown state.
        """
        try:
            in_sync = self._drain_body()
        except (OSError, ValueError):
            in_sync = False
        if not in_sync:
            self.close_connection = True

        try:
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(BODY)))
            if not in_sync:
                self.send_header("Connection", "close")
            self.end_headers()
            if self.command != "HEAD":  # a HEAD response carries headers only
                self.wfile.write(BODY)
        except (OSError, ValueError):
            self.close_connection = True

    do_GET = do_POST = do_PUT = do_DELETE = do_PATCH = _respond
    do_HEAD = do_OPTIONS = _respond

    def log_message(self, format, *args):  # silence per-request logging
        pass


def main() -> int:
    """Bind the sink and serve until the process is terminated.

    Returns a process exit status: 1 when the address cannot be bound (for example
    because something is already listening on the port), 0 on interrupt.
    """
    try:
        server = ThreadingHTTPServer((HOST, PORT), Handler)
    except OSError as exc:
        print(f"localhost-trace-sink: cannot bind {HOST}:{PORT}: {exc}", file=sys.stderr)
        return 1
    with server:
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            pass
    return 0


if __name__ == "__main__":
    sys.exit(main())