diff --git a/.changeset/sampling-improvements.md b/.changeset/sampling-improvements.md new file mode 100644 index 000000000..304caaf40 --- /dev/null +++ b/.changeset/sampling-improvements.md @@ -0,0 +1,5 @@ +--- +"@hyperdx/app": patch +--- + +feat: deterministic sampling with adaptive sample size for Event Deltas diff --git a/packages/app/src/components/DBDeltaChart.tsx b/packages/app/src/components/DBDeltaChart.tsx index fd5bd68de..d38d44ff9 100644 --- a/packages/app/src/components/DBDeltaChart.tsx +++ b/packages/app/src/components/DBDeltaChart.tsx @@ -23,9 +23,11 @@ import { getFirstTimestampValueExpression } from '@/source'; import { SQLPreview } from './ChartSQLPreview'; import { getPropertyStatistics, + getStableSampleExpression, isDenylisted, isHighCardinality, mergeValueStatisticsMaps, + SAMPLE_SIZE, } from './deltaChartUtils'; import { CHART_GAP, @@ -42,6 +44,7 @@ export default function DBDeltaChart({ xMax, yMin, yMax, + spanIdExpression, }: { config: ChartConfigWithDateRange; valueExpr: string; @@ -49,10 +52,14 @@ export default function DBDeltaChart({ xMax: number; yMin: number; yMax: number; + spanIdExpression?: string; }) { // Determine if the value expression uses aggregate functions const isAggregate = isAggregateFunction(valueExpr); + // Build deterministic ORDER BY expression from source's spanIdExpression + const stableSampleExpr = getStableSampleExpression(spanIdExpression); + // Get the timestamp expression from config const timestampExpr = getFirstTimestampValueExpression( config.timestampValueExpression, @@ -136,8 +143,8 @@ export default function DBDeltaChart({ ] : []), ], - orderBy: [{ ordering: 'DESC', valueExpression: 'rand()' }], - limit: { limit: 1000 }, + orderBy: [{ ordering: 'DESC', valueExpression: stableSampleExpr }], + limit: { limit: SAMPLE_SIZE }, }, }, ]; @@ -191,8 +198,8 @@ export default function DBDeltaChart({ with: buildWithClauses(true), select: '*', filters: buildFilters(true), - orderBy: [{ ordering: 'DESC', 
valueExpression: 'rand()' }], - limit: { limit: 1000 }, + orderBy: [{ ordering: 'DESC', valueExpression: stableSampleExpr }], + limit: { limit: SAMPLE_SIZE }, }); const { data: inlierData } = useQueriedChartConfig({ @@ -200,8 +207,8 @@ export default function DBDeltaChart({ with: buildWithClauses(false), select: '*', filters: buildFilters(false), - orderBy: [{ ordering: 'DESC', valueExpression: 'rand()' }], - limit: { limit: 1000 }, + orderBy: [{ ordering: 'DESC', valueExpression: stableSampleExpr }], + limit: { limit: SAMPLE_SIZE }, }); // Column metadata for field classification (from ClickHouse response) diff --git a/packages/app/src/components/Search/DBSearchHeatmapChart.tsx b/packages/app/src/components/Search/DBSearchHeatmapChart.tsx index 1b710ea1c..fea4425cd 100644 --- a/packages/app/src/components/Search/DBSearchHeatmapChart.tsx +++ b/packages/app/src/components/Search/DBSearchHeatmapChart.tsx @@ -119,6 +119,7 @@ export function DBSearchHeatmapChart({ xMax={fields.xMax} yMin={fields.yMin} yMax={fields.yMax} + spanIdExpression={source.spanIdExpression} /> ) : (
diff --git a/packages/app/src/components/__tests__/deltaChartSampling.test.ts b/packages/app/src/components/__tests__/deltaChartSampling.test.ts new file mode 100644 index 000000000..7403ec0eb --- /dev/null +++ b/packages/app/src/components/__tests__/deltaChartSampling.test.ts @@ -0,0 +1,57 @@ +import { + computeEffectiveSampleSize, + getStableSampleExpression, + MAX_SAMPLE_SIZE, + MIN_SAMPLE_SIZE, + SAMPLE_RATIO, + SAMPLE_SIZE, +} from '../deltaChartUtils'; + +describe('getStableSampleExpression', () => { + it('returns cityHash64 of spanIdExpression when provided', () => { + expect(getStableSampleExpression('SpanId')).toBe('cityHash64(SpanId)'); + }); + + it('uses custom spanId column name', () => { + expect(getStableSampleExpression('my_span_id')).toBe( + 'cityHash64(my_span_id)', + ); + }); + + it('falls back to rand() when spanIdExpression is undefined', () => { + expect(getStableSampleExpression(undefined)).toBe('rand()'); + }); + + it('falls back to rand() when spanIdExpression is empty', () => { + expect(getStableSampleExpression('')).toBe('rand()'); + }); +}); + +describe('computeEffectiveSampleSize', () => { + it('returns SAMPLE_SIZE when totalCount is 0 (fallback)', () => { + expect(computeEffectiveSampleSize(0)).toBe(SAMPLE_SIZE); + }); + + it('returns SAMPLE_SIZE when totalCount is negative', () => { + expect(computeEffectiveSampleSize(-1)).toBe(SAMPLE_SIZE); + }); + + it('returns MIN_SAMPLE_SIZE for small datasets', () => { + expect(computeEffectiveSampleSize(100)).toBe(MIN_SAMPLE_SIZE); + }); + + it('returns SAMPLE_RATIO * totalCount for mid-size datasets', () => { + const result = computeEffectiveSampleSize(200_000); + expect(result).toBe(Math.ceil(200_000 * SAMPLE_RATIO)); + expect(result).toBeGreaterThan(MIN_SAMPLE_SIZE); + expect(result).toBeLessThan(MAX_SAMPLE_SIZE); + }); + + it('caps at MAX_SAMPLE_SIZE for very large datasets', () => { + expect(computeEffectiveSampleSize(10_000_000)).toBe(MAX_SAMPLE_SIZE); + }); + + it('returns exact 1% for 
// ---------------------------------------------------------------------------
// Sampling configuration
// ---------------------------------------------------------------------------

/** Default number of rows sampled when the total count is unknown */
export const SAMPLE_SIZE = 1000;

/** Minimum number of rows to sample */
export const MIN_SAMPLE_SIZE = 500;

/** Maximum number of rows to sample */
export const MAX_SAMPLE_SIZE = 5000;

/** Fraction of total rows to sample (e.g., 0.01 = 1%) */
export const SAMPLE_RATIO = 0.01;

/**
 * Builds the ORDER BY expression used to pick sample rows.
 *
 * When a span-id expression is supplied, it is wrapped in `cityHash64(...)`
 * so the same rows sort to the top on every run. Without one, the function
 * falls back to `rand()`, which is not stable between runs.
 *
 * @param spanIdExpression - Column or SQL expression identifying a span.
 *   Surrounding whitespace is ignored; `undefined`, `''`, and
 *   whitespace-only strings all select the `rand()` fallback.
 * @returns The SQL expression to sort by.
 */
export function getStableSampleExpression(spanIdExpression?: string): string {
  // Trim first: a whitespace-only value would otherwise yield `cityHash64(  )`,
  // a call with no arguments.
  const expression = spanIdExpression?.trim();
  if (!expression) {
    return 'rand()';
  }
  return `cityHash64(${expression})`;
}

/**
 * Computes how many rows to sample for a given total row count.
 *
 * Formula: `ceil(totalCount * SAMPLE_RATIO)` clamped to
 * `[MIN_SAMPLE_SIZE, MAX_SAMPLE_SIZE]`.
 *
 * @example
 * computeEffectiveSampleSize(100);        // 500  (MIN_SAMPLE_SIZE)
 * computeEffectiveSampleSize(100_000);    // 1000 (exactly 1%)
 * computeEffectiveSampleSize(10_000_000); // 5000 (MAX_SAMPLE_SIZE)
 *
 * @param totalCount - Total number of rows matching the query.
 * @returns The row limit to use. Falls back to `SAMPLE_SIZE` when
 *   `totalCount` is zero, negative, or `NaN`.
 */
export function computeEffectiveSampleSize(totalCount: number): number {
  // `!(x > 0)` rather than `x <= 0`: NaN fails both comparisons, and letting it
  // through would make Math.min/Math.max return NaN as the sample size.
  if (!(totalCount > 0)) {
    return SAMPLE_SIZE;
  }
  const proportional = Math.ceil(totalCount * SAMPLE_RATIO);
  return Math.min(MAX_SAMPLE_SIZE, Math.max(MIN_SAMPLE_SIZE, proportional));
}