From ff88d90f52a9e0266758ac70731bf0bdc312fc2c Mon Sep 17 00:00:00 2001 From: Alex Fedotyev Date: Wed, 4 Mar 2026 17:03:50 -0800 Subject: [PATCH 1/3] feat: deterministic sampling with adaptive sample size - Replace ORDER BY rand() with cityHash64(SpanId) for deterministic sampling - Replace hardcoded LIMIT 1000 with SAMPLE_SIZE constant - Add computeEffectiveSampleSize() for adaptive sizing (clamp 500-5000) - Add STABLE_SAMPLE_EXPR, SAMPLE_RATIO constants - 6 unit tests for computeEffectiveSampleSize Closes #1827 Co-Authored-By: Claude Opus 4.6 (1M context) --- .changeset/sampling-improvements.md | 5 +++ packages/app/src/components/DBDeltaChart.tsx | 15 ++++---- .../__tests__/deltaChartSampling.test.ts | 36 +++++++++++++++++++ .../app/src/components/deltaChartUtils.ts | 33 +++++++++++++++++ 4 files changed, 83 insertions(+), 6 deletions(-) create mode 100644 .changeset/sampling-improvements.md create mode 100644 packages/app/src/components/__tests__/deltaChartSampling.test.ts diff --git a/.changeset/sampling-improvements.md b/.changeset/sampling-improvements.md new file mode 100644 index 000000000..304caaf40 --- /dev/null +++ b/.changeset/sampling-improvements.md @@ -0,0 +1,5 @@ +--- +"@hyperdx/app": patch +--- + +feat: deterministic sampling with adaptive sample size for Event Deltas diff --git a/packages/app/src/components/DBDeltaChart.tsx b/packages/app/src/components/DBDeltaChart.tsx index fd5bd68de..b6699be8a 100644 --- a/packages/app/src/components/DBDeltaChart.tsx +++ b/packages/app/src/components/DBDeltaChart.tsx @@ -22,10 +22,13 @@ import { getFirstTimestampValueExpression } from '@/source'; import { SQLPreview } from './ChartSQLPreview'; import { + computeEffectiveSampleSize, getPropertyStatistics, isDenylisted, isHighCardinality, mergeValueStatisticsMaps, + SAMPLE_SIZE, + STABLE_SAMPLE_EXPR, } from './deltaChartUtils'; import { CHART_GAP, @@ -136,8 +139,8 @@ export default function DBDeltaChart({ ] : []), ], - orderBy: [{ ordering: 'DESC', 
valueExpression: 'rand()' }], - limit: { limit: 1000 }, + orderBy: [{ ordering: 'DESC', valueExpression: STABLE_SAMPLE_EXPR }], + limit: { limit: SAMPLE_SIZE }, }, }, ]; @@ -191,8 +194,8 @@ export default function DBDeltaChart({ with: buildWithClauses(true), select: '*', filters: buildFilters(true), - orderBy: [{ ordering: 'DESC', valueExpression: 'rand()' }], - limit: { limit: 1000 }, + orderBy: [{ ordering: 'DESC', valueExpression: STABLE_SAMPLE_EXPR }], + limit: { limit: SAMPLE_SIZE }, }); const { data: inlierData } = useQueriedChartConfig({ @@ -200,8 +203,8 @@ export default function DBDeltaChart({ with: buildWithClauses(false), select: '*', filters: buildFilters(false), - orderBy: [{ ordering: 'DESC', valueExpression: 'rand()' }], - limit: { limit: 1000 }, + orderBy: [{ ordering: 'DESC', valueExpression: STABLE_SAMPLE_EXPR }], + limit: { limit: SAMPLE_SIZE }, }); // Column metadata for field classification (from ClickHouse response) diff --git a/packages/app/src/components/__tests__/deltaChartSampling.test.ts b/packages/app/src/components/__tests__/deltaChartSampling.test.ts new file mode 100644 index 000000000..1a805b920 --- /dev/null +++ b/packages/app/src/components/__tests__/deltaChartSampling.test.ts @@ -0,0 +1,36 @@ +import { + computeEffectiveSampleSize, + MAX_SAMPLE_SIZE, + MIN_SAMPLE_SIZE, + SAMPLE_RATIO, + SAMPLE_SIZE, +} from '../deltaChartUtils'; + +describe('computeEffectiveSampleSize', () => { + it('returns SAMPLE_SIZE when totalCount is 0 (fallback)', () => { + expect(computeEffectiveSampleSize(0)).toBe(SAMPLE_SIZE); + }); + + it('returns SAMPLE_SIZE when totalCount is negative', () => { + expect(computeEffectiveSampleSize(-1)).toBe(SAMPLE_SIZE); + }); + + it('returns MIN_SAMPLE_SIZE for small datasets', () => { + expect(computeEffectiveSampleSize(100)).toBe(MIN_SAMPLE_SIZE); + }); + + it('returns SAMPLE_RATIO * totalCount for mid-size datasets', () => { + const result = computeEffectiveSampleSize(200_000); + 
expect(result).toBe(Math.ceil(200_000 * SAMPLE_RATIO)); + expect(result).toBeGreaterThan(MIN_SAMPLE_SIZE); + expect(result).toBeLessThan(MAX_SAMPLE_SIZE); + }); + + it('caps at MAX_SAMPLE_SIZE for very large datasets', () => { + expect(computeEffectiveSampleSize(10_000_000)).toBe(MAX_SAMPLE_SIZE); + }); + + it('returns exact 1% for datasets where 1% falls in the valid range', () => { + expect(computeEffectiveSampleSize(100_000)).toBe(1000); + }); +}); diff --git a/packages/app/src/components/deltaChartUtils.ts b/packages/app/src/components/deltaChartUtils.ts index cad5013bf..71f698b16 100644 --- a/packages/app/src/components/deltaChartUtils.ts +++ b/packages/app/src/components/deltaChartUtils.ts @@ -283,3 +283,36 @@ export function isHighCardinality( return effectiveUniqueness > 0.9; } + +// --------------------------------------------------------------------------- +// Sampling configuration +// --------------------------------------------------------------------------- + +// SAMPLE_SIZE: default number of rows sampled when the total count is unknown. +// MIN_SAMPLE_SIZE / MAX_SAMPLE_SIZE: bounds for adaptive sampling. +// SAMPLE_RATIO: fraction of total rows to sample (e.g., 0.01 = 1%). +// +// Adaptive formula: clamp(MIN, ceil(totalCount * SAMPLE_RATIO), MAX) +// Falls back to SAMPLE_SIZE when total count is not yet available. +// +// STABLE_SAMPLE_EXPR: ClickHouse expression used for ORDER BY in sample queries. +// 'cityHash64(SpanId)' gives deterministic ordering — same data always produces +// the same sample, so hover highlights are stable across re-renders. +// Set to 'rand()' to restore non-deterministic sampling. +export const SAMPLE_SIZE = 1000; +export const MIN_SAMPLE_SIZE = 500; +export const MAX_SAMPLE_SIZE = 5000; +export const SAMPLE_RATIO = 0.01; +export const STABLE_SAMPLE_EXPR = 'cityHash64(SpanId)'; + +/** + * Computes the effective sample size based on total row count. + * Returns SAMPLE_SIZE as fallback when totalCount is 0 or unavailable. 
+ */ +export function computeEffectiveSampleSize(totalCount: number): number { + if (totalCount <= 0) return SAMPLE_SIZE; + return Math.min( + MAX_SAMPLE_SIZE, + Math.max(MIN_SAMPLE_SIZE, Math.ceil(totalCount * SAMPLE_RATIO)), + ); +} From 70023920ca046025f3087ddaf86103da695662f3 Mon Sep 17 00:00:00 2001 From: Alex Fedotyev Date: Wed, 4 Mar 2026 17:07:39 -0800 Subject: [PATCH 2/3] fix: remove unused computeEffectiveSampleSize import, clarify SpanId scope - Remove dead import of computeEffectiveSampleSize (adaptive sizing via count query will be added in a follow-up) - Add comment explaining STABLE_SAMPLE_EXPR uses SpanId because Event Deltas is currently trace-specific; should be parameterized if the feature expands to logs/metrics Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/app/src/components/DBDeltaChart.tsx | 1 - packages/app/src/components/deltaChartUtils.ts | 7 +++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/packages/app/src/components/DBDeltaChart.tsx b/packages/app/src/components/DBDeltaChart.tsx index b6699be8a..045dbc529 100644 --- a/packages/app/src/components/DBDeltaChart.tsx +++ b/packages/app/src/components/DBDeltaChart.tsx @@ -22,7 +22,6 @@ import { getFirstTimestampValueExpression } from '@/source'; import { SQLPreview } from './ChartSQLPreview'; import { - computeEffectiveSampleSize, getPropertyStatistics, isDenylisted, isHighCardinality, diff --git a/packages/app/src/components/deltaChartUtils.ts b/packages/app/src/components/deltaChartUtils.ts index 71f698b16..3f6c64b00 100644 --- a/packages/app/src/components/deltaChartUtils.ts +++ b/packages/app/src/components/deltaChartUtils.ts @@ -296,8 +296,11 @@ export function isHighCardinality( // Falls back to SAMPLE_SIZE when total count is not yet available. // // STABLE_SAMPLE_EXPR: ClickHouse expression used for ORDER BY in sample queries. 
-// 'cityHash64(SpanId)' gives deterministic ordering — same data always produces -// the same sample, so hover highlights are stable across re-renders. +// Deterministic ordering ensures the same data always produces the same sample, +// so hover highlights are stable across re-renders. +// Currently trace-specific (SpanId is always present on the traces search page +// where Event Deltas is rendered). If Event Deltas expands to logs/metrics, +// this should be parameterized per source. // Set to 'rand()' to restore non-deterministic sampling. export const SAMPLE_SIZE = 1000; export const MIN_SAMPLE_SIZE = 500; From 980ab65f549f2af95e31f0034b19ddf4d84a154e Mon Sep 17 00:00:00 2001 From: Alex Fedotyev Date: Thu, 5 Mar 2026 08:21:47 -0800 Subject: [PATCH 3/3] refactor: use source's spanIdExpression instead of hardcoded SpanId Address review feedback from @pulpdrew: - Replace hardcoded STABLE_SAMPLE_EXPR constant with getStableSampleExpression() that accepts the source's spanIdExpression, falling back to rand() - Pass spanIdExpression from source through DBSearchHeatmapChart -> DBDeltaChart - Apply JSDoc per-constant style for better editor hover docs - Add 4 tests for getStableSampleExpression (10 total) Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/app/src/components/DBDeltaChart.tsx | 13 ++++--- .../Search/DBSearchHeatmapChart.tsx | 1 + .../__tests__/deltaChartSampling.test.ts | 21 ++++++++++++ .../app/src/components/deltaChartUtils.ts | 34 +++++++++++-------- 4 files changed, 50 insertions(+), 19 deletions(-) diff --git a/packages/app/src/components/DBDeltaChart.tsx b/packages/app/src/components/DBDeltaChart.tsx index 045dbc529..d38d44ff9 100644 --- a/packages/app/src/components/DBDeltaChart.tsx +++ b/packages/app/src/components/DBDeltaChart.tsx @@ -23,11 +23,11 @@ import { getFirstTimestampValueExpression } from '@/source'; import { SQLPreview } from './ChartSQLPreview'; import { getPropertyStatistics, + getStableSampleExpression, isDenylisted, 
isHighCardinality, mergeValueStatisticsMaps, SAMPLE_SIZE, - STABLE_SAMPLE_EXPR, } from './deltaChartUtils'; import { CHART_GAP, @@ -44,6 +44,7 @@ export default function DBDeltaChart({ xMax, yMin, yMax, + spanIdExpression, }: { config: ChartConfigWithDateRange; valueExpr: string; @@ -51,10 +52,14 @@ export default function DBDeltaChart({ xMax: number; yMin: number; yMax: number; + spanIdExpression?: string; }) { // Determine if the value expression uses aggregate functions const isAggregate = isAggregateFunction(valueExpr); + // Build deterministic ORDER BY expression from source's spanIdExpression + const stableSampleExpr = getStableSampleExpression(spanIdExpression); + // Get the timestamp expression from config const timestampExpr = getFirstTimestampValueExpression( config.timestampValueExpression, @@ -138,7 +143,7 @@ export default function DBDeltaChart({ ] : []), ], - orderBy: [{ ordering: 'DESC', valueExpression: STABLE_SAMPLE_EXPR }], + orderBy: [{ ordering: 'DESC', valueExpression: stableSampleExpr }], limit: { limit: SAMPLE_SIZE }, }, }, @@ -193,7 +198,7 @@ export default function DBDeltaChart({ with: buildWithClauses(true), select: '*', filters: buildFilters(true), - orderBy: [{ ordering: 'DESC', valueExpression: STABLE_SAMPLE_EXPR }], + orderBy: [{ ordering: 'DESC', valueExpression: stableSampleExpr }], limit: { limit: SAMPLE_SIZE }, }); @@ -202,7 +207,7 @@ export default function DBDeltaChart({ with: buildWithClauses(false), select: '*', filters: buildFilters(false), - orderBy: [{ ordering: 'DESC', valueExpression: STABLE_SAMPLE_EXPR }], + orderBy: [{ ordering: 'DESC', valueExpression: stableSampleExpr }], limit: { limit: SAMPLE_SIZE }, }); diff --git a/packages/app/src/components/Search/DBSearchHeatmapChart.tsx b/packages/app/src/components/Search/DBSearchHeatmapChart.tsx index 1b710ea1c..fea4425cd 100644 --- a/packages/app/src/components/Search/DBSearchHeatmapChart.tsx +++ b/packages/app/src/components/Search/DBSearchHeatmapChart.tsx @@ -119,6 +119,7 
@@ export function DBSearchHeatmapChart({ xMax={fields.xMax} yMin={fields.yMin} yMax={fields.yMax} + spanIdExpression={source.spanIdExpression} /> ) : (
diff --git a/packages/app/src/components/__tests__/deltaChartSampling.test.ts b/packages/app/src/components/__tests__/deltaChartSampling.test.ts index 1a805b920..7403ec0eb 100644 --- a/packages/app/src/components/__tests__/deltaChartSampling.test.ts +++ b/packages/app/src/components/__tests__/deltaChartSampling.test.ts @@ -1,11 +1,32 @@ import { computeEffectiveSampleSize, + getStableSampleExpression, MAX_SAMPLE_SIZE, MIN_SAMPLE_SIZE, SAMPLE_RATIO, SAMPLE_SIZE, } from '../deltaChartUtils'; +describe('getStableSampleExpression', () => { + it('returns cityHash64 of spanIdExpression when provided', () => { + expect(getStableSampleExpression('SpanId')).toBe('cityHash64(SpanId)'); + }); + + it('uses custom spanId column name', () => { + expect(getStableSampleExpression('my_span_id')).toBe( + 'cityHash64(my_span_id)', + ); + }); + + it('falls back to rand() when spanIdExpression is undefined', () => { + expect(getStableSampleExpression(undefined)).toBe('rand()'); + }); + + it('falls back to rand() when spanIdExpression is empty', () => { + expect(getStableSampleExpression('')).toBe('rand()'); + }); +}); + describe('computeEffectiveSampleSize', () => { it('returns SAMPLE_SIZE when totalCount is 0 (fallback)', () => { expect(computeEffectiveSampleSize(0)).toBe(SAMPLE_SIZE); diff --git a/packages/app/src/components/deltaChartUtils.ts b/packages/app/src/components/deltaChartUtils.ts index 3f6c64b00..1fe08c793 100644 --- a/packages/app/src/components/deltaChartUtils.ts +++ b/packages/app/src/components/deltaChartUtils.ts @@ -288,28 +288,32 @@ export function isHighCardinality( // Sampling configuration // --------------------------------------------------------------------------- -// SAMPLE_SIZE: default number of rows sampled when the total count is unknown. -// MIN_SAMPLE_SIZE / MAX_SAMPLE_SIZE: bounds for adaptive sampling. -// SAMPLE_RATIO: fraction of total rows to sample (e.g., 0.01 = 1%). 
-// -// Adaptive formula: clamp(MIN, ceil(totalCount * SAMPLE_RATIO), MAX) -// Falls back to SAMPLE_SIZE when total count is not yet available. -// -// STABLE_SAMPLE_EXPR: ClickHouse expression used for ORDER BY in sample queries. -// Deterministic ordering ensures the same data always produces the same sample, -// so hover highlights are stable across re-renders. -// Currently trace-specific (SpanId is always present on the traces search page -// where Event Deltas is rendered). If Event Deltas expands to logs/metrics, -// this should be parameterized per source. -// Set to 'rand()' to restore non-deterministic sampling. +/** Default number of rows sampled when the total count is unknown */ export const SAMPLE_SIZE = 1000; + +/** Minimum number of rows to sample */ export const MIN_SAMPLE_SIZE = 500; + +/** Maximum number of rows to sample */ export const MAX_SAMPLE_SIZE = 5000; + +/** Fraction of total rows to sample (e.g., 0.01 = 1%) */ export const SAMPLE_RATIO = 0.01; -export const STABLE_SAMPLE_EXPR = 'cityHash64(SpanId)'; + +/** + * Builds the ORDER BY expression for sampling: a stable cityHash64 of the + * source's spanIdExpression, or non-deterministic rand() when it is absent. + */ +export function getStableSampleExpression(spanIdExpression?: string): string { + if (spanIdExpression) { + return `cityHash64(${spanIdExpression})`; + } + return 'rand()'; +} /** * Computes the effective sample size based on total row count. + * Adaptive formula: clamp(MIN_SAMPLE_SIZE, ceil(totalCount * SAMPLE_RATIO), MAX_SAMPLE_SIZE). * Returns SAMPLE_SIZE as fallback when totalCount is 0 or unavailable. */ export function computeEffectiveSampleSize(totalCount: number): number {