From 7ce3e3e4743bccd8408614e56a523a2d65f23483 Mon Sep 17 00:00:00 2001 From: "zhou.cai" Date: Mon, 9 Mar 2026 18:41:26 +0800 Subject: [PATCH 01/26] fix topru proto --- proto/tidb.proto | 6 +- src/sources/topsql/upstream/tidb/parser.rs | 2 +- src/sources/topsql/upstream/tidb/proto.rs | 4 +- src/sources/topsql_v2/upstream/tidb/parser.rs | 124 ++++++++---------- src/sources/topsql_v2/upstream/tidb/proto.rs | 4 +- 5 files changed, 62 insertions(+), 78 deletions(-) diff --git a/proto/tidb.proto b/proto/tidb.proto index 1dec653..cacd2da 100644 --- a/proto/tidb.proto +++ b/proto/tidb.proto @@ -62,10 +62,6 @@ message TopRURecord { repeated TopRURecordItem items = 5; } -message ReportTopRURecords { - repeated TopRURecord records = 1; -} - // TopRURecordItem represents statistics within a single time bucket. message TopRURecordItem { uint64 timestamp_sec = 1; // timestamp in second @@ -117,6 +113,6 @@ message TopSQLSubResponse { TopSQLRecord record = 1; SQLMeta sql_meta = 2; PlanMeta plan_meta = 3; - ReportTopRURecords top_ru_records = 4; + TopRURecord ru_record = 4; } } diff --git a/src/sources/topsql/upstream/tidb/parser.rs b/src/sources/topsql/upstream/tidb/parser.rs index e201ef2..ec61098 100644 --- a/src/sources/topsql/upstream/tidb/parser.rs +++ b/src/sources/topsql/upstream/tidb/parser.rs @@ -37,7 +37,7 @@ impl UpstreamEventParser for TopSqlSubResponseParser { } Some(RespOneof::SqlMeta(sql_meta)) => Self::parse_tidb_sql_meta(sql_meta), Some(RespOneof::PlanMeta(plan_meta)) => Self::parse_tidb_plan_meta(plan_meta), - Some(RespOneof::TopRuRecords(_)) => vec![], // TODO: implement TopRURecords parsing + Some(RespOneof::RuRecord(_)) => vec![], // TODO: implement TopRU record parsing None => vec![], } } diff --git a/src/sources/topsql/upstream/tidb/proto.rs b/src/sources/topsql/upstream/tidb/proto.rs index 345584b..48c1e10 100644 --- a/src/sources/topsql/upstream/tidb/proto.rs +++ b/src/sources/topsql/upstream/tidb/proto.rs @@ -24,9 +24,7 @@ impl ByteSizeOf for RespOneof { 
RespOneof::PlanMeta(plan_meta) => { plan_meta.plan_digest.len() + plan_meta.normalized_plan.len() } - RespOneof::TopRuRecords(top_ru_records) => { - top_ru_records.records.size_of() - } + RespOneof::RuRecord(ru_record) => ru_record.size_of(), } } } diff --git a/src/sources/topsql_v2/upstream/tidb/parser.rs b/src/sources/topsql_v2/upstream/tidb/parser.rs index 5c35c03..50d1aba 100644 --- a/src/sources/topsql_v2/upstream/tidb/parser.rs +++ b/src/sources/topsql_v2/upstream/tidb/parser.rs @@ -36,7 +36,7 @@ impl UpstreamEventParser for TopSqlSubResponseParser { } Some(RespOneof::SqlMeta(sql_meta)) => Self::parse_tidb_sql_meta(sql_meta), Some(RespOneof::PlanMeta(plan_meta)) => Self::parse_tidb_plan_meta(plan_meta), - Some(RespOneof::TopRuRecords(top_ru_records)) => Self::parse_top_ru_records(top_ru_records), + Some(RespOneof::RuRecord(ru_record)) => Self::parse_top_ru_record(ru_record), None => vec![], } } @@ -320,52 +320,48 @@ impl TopSqlSubResponseParser { events } - fn parse_top_ru_records(top_ru_records: crate::sources::topsql_v2::upstream::tidb::proto::ReportTopRuRecords) -> Vec { + fn parse_top_ru_record(record: crate::sources::topsql_v2::upstream::tidb::proto::TopRuRecord) -> Vec { let mut events = vec![]; let mut date = String::new(); - - for record in top_ru_records.records { - let mut keyspace_name_str = "".to_string(); - if !record.keyspace_name.is_empty() { - if let Ok(ks) = String::from_utf8(record.keyspace_name.clone()) { - keyspace_name_str = ks; - } + + let mut keyspace_name_str = "".to_string(); + if !record.keyspace_name.is_empty() { + if let Ok(ks) = String::from_utf8(record.keyspace_name.clone()) { + keyspace_name_str = ks; } - - for item in record.items { - let mut event = Event::Log(LogEvent::default()); - let log = event.as_mut_log(); + } - // Add metadata with Vector prefix - log.insert(LABEL_SOURCE_TABLE, SOURCE_TABLE_TOPRU); - log.insert(LABEL_TIMESTAMPS, LogValue::from(item.timestamp_sec)); - - if date.is_empty() { - date = 
chrono::DateTime::from_timestamp(item.timestamp_sec as i64, 0) - .map(|dt| dt.format("%Y-%m-%d").to_string()) - .unwrap_or_else(|| "1970-01-01".to_string()); - } - log.insert(LABEL_DATE, LogValue::from(date.clone())); - - // Note: TopRU doesn't use instance_key - all instances write to same table - if !keyspace_name_str.is_empty() { - log.insert(LABEL_KEYSPACE, keyspace_name_str.clone()); - } - log.insert(LABEL_USER, record.user.clone()); - log.insert( - LABEL_SQL_DIGEST, - hex::encode_upper(record.sql_digest.clone()), - ); - log.insert( - LABEL_PLAN_DIGEST, - hex::encode_upper(record.plan_digest.clone()), - ); - log.insert(METRIC_NAME_TOTAL_RU, LogValue::from(item.total_ru)); - log.insert(METRIC_NAME_EXEC_COUNT, LogValue::from(item.exec_count)); - log.insert(METRIC_NAME_EXEC_DURATION, LogValue::from(item.exec_duration)); - - events.push(event.into_log()); + for item in record.items { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + log.insert(LABEL_SOURCE_TABLE, SOURCE_TABLE_TOPRU); + log.insert(LABEL_TIMESTAMPS, LogValue::from(item.timestamp_sec)); + + if date.is_empty() { + date = chrono::DateTime::from_timestamp(item.timestamp_sec as i64, 0) + .map(|dt| dt.format("%Y-%m-%d").to_string()) + .unwrap_or_else(|| "1970-01-01".to_string()); + } + log.insert(LABEL_DATE, LogValue::from(date.clone())); + + if !keyspace_name_str.is_empty() { + log.insert(LABEL_KEYSPACE, keyspace_name_str.clone()); } + log.insert(LABEL_USER, record.user.clone()); + log.insert( + LABEL_SQL_DIGEST, + hex::encode_upper(record.sql_digest.clone()), + ); + log.insert( + LABEL_PLAN_DIGEST, + hex::encode_upper(record.plan_digest.clone()), + ); + log.insert(METRIC_NAME_TOTAL_RU, LogValue::from(item.total_ru)); + log.insert(METRIC_NAME_EXEC_COUNT, LogValue::from(item.exec_count)); + log.insert(METRIC_NAME_EXEC_DURATION, LogValue::from(item.exec_duration)); + + events.push(event.into_log()); } events } @@ -374,7 +370,7 @@ impl TopSqlSubResponseParser { 
#[cfg(test)] mod tests { use super::*; - use crate::sources::topsql_v2::upstream::tidb::proto::{TopSqlRecordItem, TopRuRecord, TopRuRecordItem, ReportTopRuRecords}; + use crate::sources::topsql_v2::upstream::tidb::proto::{TopSqlRecordItem, TopRuRecord, TopRuRecordItem}; const MOCK_RECORDS: &'static str = include_str!("testdata/mock-records.json"); @@ -885,33 +881,29 @@ mod tests { } #[test] - fn test_parse_top_ru_records() { - let top_ru_records = ReportTopRuRecords { - records: vec![ - TopRuRecord { - keyspace_name: b"test_keyspace".to_vec(), - user: "test_user".to_string(), - sql_digest: b"sql_digest_123".to_vec(), - plan_digest: b"plan_digest_456".to_vec(), - items: vec![ - TopRuRecordItem { - timestamp_sec: 1709646900, - total_ru: 100.5, - exec_count: 10, - exec_duration: 50000000, // 50ms in nanoseconds - }, - TopRuRecordItem { - timestamp_sec: 1709646960, - total_ru: 200.0, - exec_count: 20, - exec_duration: 100000000, // 100ms in nanoseconds - }, - ], + fn test_parse_top_ru_record() { + let ru_record = TopRuRecord { + keyspace_name: b"test_keyspace".to_vec(), + user: "test_user".to_string(), + sql_digest: b"sql_digest_123".to_vec(), + plan_digest: b"plan_digest_456".to_vec(), + items: vec![ + TopRuRecordItem { + timestamp_sec: 1709646900, + total_ru: 100.5, + exec_count: 10, + exec_duration: 50000000, // 50ms in nanoseconds + }, + TopRuRecordItem { + timestamp_sec: 1709646960, + total_ru: 200.0, + exec_count: 20, + exec_duration: 100000000, // 100ms in nanoseconds }, ], }; - let events = TopSqlSubResponseParser::parse_top_ru_records(top_ru_records); + let events = TopSqlSubResponseParser::parse_top_ru_record(ru_record); assert_eq!(events.len(), 2); // Check first event diff --git a/src/sources/topsql_v2/upstream/tidb/proto.rs b/src/sources/topsql_v2/upstream/tidb/proto.rs index 345584b..48c1e10 100644 --- a/src/sources/topsql_v2/upstream/tidb/proto.rs +++ b/src/sources/topsql_v2/upstream/tidb/proto.rs @@ -24,9 +24,7 @@ impl ByteSizeOf for RespOneof { 
RespOneof::PlanMeta(plan_meta) => { plan_meta.plan_digest.len() + plan_meta.normalized_plan.len() } - RespOneof::TopRuRecords(top_ru_records) => { - top_ru_records.records.size_of() - } + RespOneof::RuRecord(ru_record) => ru_record.size_of(), } } } From 387215d3f603a8b00096992cb06097bef7461f84 Mon Sep 17 00:00:00 2001 From: "zhou.cai" Date: Mon, 16 Mar 2026 13:10:45 +0800 Subject: [PATCH 02/26] change type to component --- src/sinks/topsql_data_deltalake/processor.rs | 4 ++-- src/sinks/topsql_meta_deltalake/processor.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sinks/topsql_data_deltalake/processor.rs b/src/sinks/topsql_data_deltalake/processor.rs index 173eb50..2620411 100644 --- a/src/sinks/topsql_data_deltalake/processor.rs +++ b/src/sinks/topsql_data_deltalake/processor.rs @@ -456,12 +456,12 @@ impl TopSQLDeltaLakeSink { } }; - let type_dir = format!("type=topsql_{}", table_type); + let type_dir = format!("component={}", table_type); let instance_dir = format!("instance={}", table_instance); let table_path = if self.base_path.to_string_lossy().starts_with("s3://") { // For S3 paths, build a partition-like directory structure - // /topsql/data/type=.../instance=.../ + // /topsql/data/component=.../instance=.../ let base = self.base_path.to_string_lossy(); let base = base.trim_end_matches('/'); PathBuf::from(format!( diff --git a/src/sinks/topsql_meta_deltalake/processor.rs b/src/sinks/topsql_meta_deltalake/processor.rs index beae8dc..f14ec1a 100644 --- a/src/sinks/topsql_meta_deltalake/processor.rs +++ b/src/sinks/topsql_meta_deltalake/processor.rs @@ -410,13 +410,13 @@ impl TopSQLDeltaLakeSink { let table_path = if self.base_path.to_string_lossy().starts_with("s3://") { // For S3 paths, append the table name to the S3 path PathBuf::from(format!( - "{}/type={}", + "{}/component={}", self.base_path.to_string_lossy(), table_name )) } else { // For local paths, use join as before - self.base_path.join(format!("type={}", table_name)) 
+ self.base_path.join(format!("component={}", table_name)) }; let table_config = self From 6aaf6afac6fc03c7cce3e60261bb5e383222721b Mon Sep 17 00:00:00 2001 From: "zhou.cai" Date: Wed, 18 Mar 2026 19:37:17 +0800 Subject: [PATCH 03/26] fix native jeprof issue --- src/sources/conprof/tools/jeprof_native.rs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/sources/conprof/tools/jeprof_native.rs b/src/sources/conprof/tools/jeprof_native.rs index ff81f81..e5e4a2f 100644 --- a/src/sources/conprof/tools/jeprof_native.rs +++ b/src/sources/conprof/tools/jeprof_native.rs @@ -187,14 +187,16 @@ fn build_symbolized_output( out.extend_from_slice(program_name.as_bytes()); out.push(b'\n'); for pc in pcs { - let sym = symbol_map - .get(pc) - .map(|s| s.as_str()) - .unwrap_or("0x"); out.extend_from_slice(b"0x"); out.extend_from_slice(pc.as_bytes()); out.push(b' '); - out.extend_from_slice(sym.as_bytes()); + if let Some(s) = symbol_map.get(pc) { + out.extend_from_slice(s.as_bytes()); + } else { + // Match Perl: when symbol is missing, use address as the symbol (0x) + out.extend_from_slice(b"0x"); + out.extend_from_slice(pc.as_bytes()); + } out.push(b'\n'); } out.extend_from_slice(b"---\n"); @@ -205,12 +207,15 @@ fn build_symbolized_output( /// Full native jeprof --raw flow: GET heap -> parse PCs -> fetch symbols + cmdline -> build output. /// If profile is binary or parsing yields no PCs, returns raw body only (no symbol header). +/// Requesting Accept: text/plain ensures the server returns pprof text format (e.g. "heap profile: ... @ 0x...") +/// so we can parse PCs and add the symbol header; otherwise some servers return jemalloc heap_v2 and we skip symbolization. 
pub async fn fetch_raw_symbolized( client: &Client, heap_url: &str, ) -> Result, String> { let body = client .get(heap_url) + .header("Accept", "text/plain") .send() .await .map_err(|e| format!("http request failed: {}", e))?; From 8667b91ba6b4fb49f5b3d873ff144e16876191be Mon Sep 17 00:00:00 2001 From: "zhou.cai" Date: Thu, 19 Mar 2026 10:24:57 +0800 Subject: [PATCH 04/26] remove arm vector first --- .github/workflows/build_image.yml | 76 +++++++++++++++---------------- Makefile | 4 +- scripts/release-docker.sh | 6 +-- 3 files changed, 43 insertions(+), 43 deletions(-) diff --git a/.github/workflows/build_image.yml b/.github/workflows/build_image.yml index fc6538a..8619ada 100644 --- a/.github/workflows/build_image.yml +++ b/.github/workflows/build_image.yml @@ -112,20 +112,20 @@ jobs: find target/aarch64-unknown-linux-gnu/release/deps -name "*.rlib" -not -name "libvector*.rlib" -delete 2>/dev/null || true find target/aarch64-unknown-linux-gnu/release/build -type f -name "*.o" -delete 2>/dev/null || true - - name: Build armv7 binary (standard) - timeout-minutes: 90 - env: - CARGO_PROFILE_RELEASE_CODEGEN_UNITS: 16 - CARGO_PROFILE_RELEASE_LTO: "thin" - CARGO_BUILD_JOBS: 4 - CARGO_INCREMENTAL: 0 - run: | - echo "Starting armv7 build at $(date)" - make build-armv7-unknown-linux-gnueabihf - echo "Finished armv7 build at $(date)" - # Clean up intermediate files to save disk space - find target/armv7-unknown-linux-gnueabihf/release/deps -name "*.rlib" -not -name "libvector*.rlib" -delete 2>/dev/null || true - find target/armv7-unknown-linux-gnueabihf/release/build -type f -name "*.o" -delete 2>/dev/null || true + # - name: Build armv7 binary (standard) + # timeout-minutes: 90 + # env: + # CARGO_PROFILE_RELEASE_CODEGEN_UNITS: 16 + # CARGO_PROFILE_RELEASE_LTO: "thin" + # CARGO_BUILD_JOBS: 4 + # CARGO_INCREMENTAL: 0 + # run: | + # echo "Starting armv7 build at $(date)" + # make build-armv7-unknown-linux-gnueabihf + # echo "Finished armv7 build at $(date)" + # # Clean up 
intermediate files to save disk space + # find target/armv7-unknown-linux-gnueabihf/release/deps -name "*.rlib" -not -name "libvector*.rlib" -delete 2>/dev/null || true + # find target/armv7-unknown-linux-gnueabihf/release/build -type f -name "*.o" -delete 2>/dev/null || true - name: Build and push standard image env: @@ -177,20 +177,20 @@ jobs: find target/aarch64-unknown-linux-gnu/release/deps -name "*.rlib" -not -name "libvector*.rlib" -delete 2>/dev/null || true find target/aarch64-unknown-linux-gnu/release/build -type f -name "*.o" -delete 2>/dev/null || true - - name: Build armv7 binary (nextgen) - timeout-minutes: 90 - env: - CARGO_PROFILE_RELEASE_CODEGEN_UNITS: 16 - CARGO_PROFILE_RELEASE_LTO: "thin" - CARGO_BUILD_JOBS: 4 - CARGO_INCREMENTAL: 0 - run: | - echo "Starting armv7 nextgen build at $(date)" - make build-armv7-unknown-linux-gnueabihf-nextgen - echo "Finished armv7 nextgen build at $(date)" - # Clean up intermediate files to save disk space - find target/armv7-unknown-linux-gnueabihf/release/deps -name "*.rlib" -not -name "libvector*.rlib" -delete 2>/dev/null || true - find target/armv7-unknown-linux-gnueabihf/release/build -type f -name "*.o" -delete 2>/dev/null || true + # - name: Build armv7 binary (nextgen) + # timeout-minutes: 90 + # env: + # CARGO_PROFILE_RELEASE_CODEGEN_UNITS: 16 + # CARGO_PROFILE_RELEASE_LTO: "thin" + # CARGO_BUILD_JOBS: 4 + # CARGO_INCREMENTAL: 0 + # run: | + # echo "Starting armv7 nextgen build at $(date)" + # make build-armv7-unknown-linux-gnueabihf-nextgen + # echo "Finished armv7 nextgen build at $(date)" + # # Clean up intermediate files to save disk space + # find target/armv7-unknown-linux-gnueabihf/release/deps -name "*.rlib" -not -name "libvector*.rlib" -delete 2>/dev/null || true + # find target/armv7-unknown-linux-gnueabihf/release/build -type f -name "*.o" -delete 2>/dev/null || true - name: Check nextgen binaries before building image run: | @@ -211,18 +211,18 @@ jobs: else echo " ❌ NOT FOUND" fi - echo "" - 
echo "armv7 binary:" - if [ -f target/armv7-unknown-linux-gnueabihf/release/vector-nextgen ]; then - ls -lh target/armv7-unknown-linux-gnueabihf/release/vector-nextgen - echo " ✅ EXISTS" - else - echo " ❌ NOT FOUND" - fi + # echo "" + # echo "armv7 binary:" + # if [ -f target/armv7-unknown-linux-gnueabihf/release/vector-nextgen ]; then + # ls -lh target/armv7-unknown-linux-gnueabihf/release/vector-nextgen + # echo " ✅ EXISTS" + # else + # echo " ❌ NOT FOUND" + # fi echo "" if [ -f target/x86_64-unknown-linux-gnu/release/vector-nextgen ] && \ - [ -f target/aarch64-unknown-linux-gnu/release/vector-nextgen ] && \ - [ -f target/armv7-unknown-linux-gnueabihf/release/vector-nextgen ]; then + [ -f target/aarch64-unknown-linux-gnu/release/vector-nextgen ]; then + # [ -f target/armv7-unknown-linux-gnueabihf/release/vector-nextgen ]; then echo "✅ All nextgen binaries exist - Makefile should skip rebuild" else echo "⚠️ Some binaries missing - Makefile will trigger rebuild" diff --git a/Makefile b/Makefile index 95219c6..733b2f0 100644 --- a/Makefile +++ b/Makefile @@ -186,7 +186,7 @@ cargo-install-%: .PHONY: release-docker release-docker: target/x86_64-unknown-linux-gnu/release/vector release-docker: target/aarch64-unknown-linux-gnu/release/vector -release-docker: target/armv7-unknown-linux-gnueabihf/release/vector +#release-docker: target/armv7-unknown-linux-gnueabihf/release/vector @echo "Releasing docker image..." @scripts/release-docker.sh @echo "Done releasing docker image." @@ -194,7 +194,7 @@ release-docker: target/armv7-unknown-linux-gnueabihf/release/vector .PHONY: release-docker-nextgen release-docker-nextgen: target/x86_64-unknown-linux-gnu/release/vector-nextgen release-docker-nextgen: target/aarch64-unknown-linux-gnu/release/vector-nextgen -release-docker-nextgen: target/armv7-unknown-linux-gnueabihf/release/vector-nextgen +# release-docker-nextgen: target/armv7-unknown-linux-gnueabihf/release/vector-nextgen @echo "Releasing docker image (nextgen mode)..." 
@NEXTGEN=true scripts/release-docker.sh @echo "Done releasing docker image (nextgen mode)." diff --git a/scripts/release-docker.sh b/scripts/release-docker.sh index 16606df..c924e55 100755 --- a/scripts/release-docker.sh +++ b/scripts/release-docker.sh @@ -35,7 +35,7 @@ BINARY_NAME="${NEXTGEN:+vector-nextgen}" BINARY_NAME="${BINARY_NAME:-vector}" cp target/x86_64-unknown-linux-gnu/release/${BINARY_NAME} "$WORK_DIR"/vector-amd64 cp target/aarch64-unknown-linux-gnu/release/${BINARY_NAME} "$WORK_DIR"/vector-arm64 -cp target/armv7-unknown-linux-gnueabihf/release/${BINARY_NAME} "$WORK_DIR"/vector-arm +# cp target/armv7-unknown-linux-gnueabihf/release/${BINARY_NAME} "$WORK_DIR"/vector-arm # cp config/vector.toml "$WORK_DIR" VERSION="${VECTOR_VERSION:-"$(scripts/version.sh)"}" @@ -45,7 +45,7 @@ BASE=debian TAG="${TAG:-$REPO:$VERSION-$BASE}" DOCKERFILE="scripts/docker/Dockerfile" -PLATFORMS="linux/amd64,linux/arm64,linux/arm/v7" -#PLATFORMS="linux/amd64,linux/arm64" +#PLATFORMS="linux/amd64,linux/arm64,linux/arm/v7" +PLATFORMS="linux/amd64,linux/arm64" echo "Building docker image: $TAG for $PLATFORMS" docker buildx build --push --platform="$PLATFORMS" -t "$TAG" -f "$DOCKERFILE" "$WORK_DIR" From ddbe3a11e670e5dc3a51d24926247169508eedb1 Mon Sep 17 00:00:00 2001 From: "zhou.cai" Date: Thu, 19 Mar 2026 11:01:24 +0800 Subject: [PATCH 05/26] native jeprof support heap_v2 --- src/sources/conprof/tools/jeprof_native.rs | 92 +++++++++++++++++----- 1 file changed, 71 insertions(+), 21 deletions(-) diff --git a/src/sources/conprof/tools/jeprof_native.rs b/src/sources/conprof/tools/jeprof_native.rs index e5e4a2f..c10663c 100644 --- a/src/sources/conprof/tools/jeprof_native.rs +++ b/src/sources/conprof/tools/jeprof_native.rs @@ -41,10 +41,16 @@ fn address_sub_one(hex_addr: &str) -> Option { Some(format!("{:0width$x}", r, width = ADDRESS_LENGTH)) } -/// Parse pprof heap profile text format and collect unique PCs (call sites). 
-/// Lines: optional % commands, then header "heap profile: ...", then -/// "\s*(\d+):\s*(\d+)\s*\[\s*(\d+):\s*(\d+)\]\s*@\s*(.*)" with addresses after @. -/// FixCallerAddresses: subtract 1 from each address except the first. +/// Parse pprof heap text format and collect unique PCs (call sites). +/// +/// Supports both: +/// 1) Heap profile entries: +/// "\s*(\d+):\s*(\d+)\s*\[\s*(\d+):\s*(\d+)\]\s*@\s*(addr1 addr2 ...)" +/// 2) Remote threaded heap v2 ("heap_v2/"): +/// - first section header: "heap_v2/" +/// - stack is provided on separate lines starting with "@ addr1 addr2 ..." +/// +/// FixCallerAddresses (jeprof): subtract 1 from each address except the first. /// Returns sorted unique PCs as 0-padded hex strings (no 0x prefix, for consistent ordering). fn parse_heap_profile_for_pcs(body: &[u8]) -> Option> { let text = str::from_utf8(body).ok()?; @@ -60,7 +66,10 @@ fn parse_heap_profile_for_pcs(body: &[u8]) -> Option> { continue; } if !past_header { - if line.starts_with("heap profile:") || line.starts_with("heap ") { + if line.starts_with("heap profile:") + || line.starts_with("heap ") + || line.starts_with("heap_v2/") + { past_header = true; } continue; @@ -68,25 +77,52 @@ fn parse_heap_profile_for_pcs(body: &[u8]) -> Option> { if line.starts_with("MAPPED_LIBRARIES:") || line.starts_with("--- Memory map:") { break; } - // Match: optional whitespace, count1: bytes1 [ count2: bytes2 ] @ addr1 addr2 ... + let rest = line.trim_start(); - let at_pos = rest.find(" @ ")?; - let stack_part = rest.get(at_pos + 3..)?.trim(); - if stack_part.is_empty() { - continue; - } - let addrs: Vec<&str> = stack_part.split_whitespace().collect(); - if addrs.is_empty() { + + // heap_v2 threaded format: + // "@ addr1 addr2 ..." 
(stack for the following t*: lines) + if rest.starts_with('@') { + let stack_part = rest.trim_start_matches('@').trim(); + if stack_part.is_empty() { + continue; + } + let addrs: Vec<&str> = stack_part.split_whitespace().collect(); + if addrs.is_empty() { + continue; + } + for (i, addr) in addrs.iter().enumerate() { + let extended = hex_extend(addr)?; + let fixed = if i == 0 { + extended + } else { + address_sub_one(&extended).unwrap_or(extended) + }; + pcs.insert(fixed); + } continue; } - for (i, addr) in addrs.iter().enumerate() { - let extended = hex_extend(addr)?; - let fixed = if i == 0 { - extended - } else { - address_sub_one(&extended).unwrap_or(extended) - }; - pcs.insert(fixed); + + // heap profile entry format: + // optional whitespace, count1: bytes1 [ count2: bytes2 ] @ addr1 addr2 ... + if let Some(at_pos) = rest.find(" @ ") { + let stack_part = rest.get(at_pos + 3..)?.trim(); + if stack_part.is_empty() { + continue; + } + let addrs: Vec<&str> = stack_part.split_whitespace().collect(); + if addrs.is_empty() { + continue; + } + for (i, addr) in addrs.iter().enumerate() { + let extended = hex_extend(addr)?; + let fixed = if i == 0 { + extended + } else { + address_sub_one(&extended).unwrap_or(extended) + }; + pcs.insert(fixed); + } } } @@ -280,6 +316,20 @@ mod tests { assert!(pcs.iter().any(|s| s.contains("12345") || s.ends_with("12345"))); } + #[test] + fn test_parse_heap_v2_at_lines() { + let body = b"heap_v2/524288 + t*: 1: 2 [ 0: 0] +@ 0x12345 0x67890 0xabc +"; + let pcs = parse_heap_profile_for_pcs(body).unwrap(); + assert!(pcs.len() >= 2); + // First addr is not FixCallerAddresses-adjusted. 
+ assert!(pcs.iter().any(|s| s.ends_with("12345"))); + // Second+ are adjusted (subtract 1), so 0x67890 -> 0x6788f + assert!(pcs.iter().any(|s| s.ends_with("6788f"))); + } + #[test] fn test_base_url_from_heap_url() { assert_eq!( From 71b0006766d059a4a0ce8946745365a29873e316 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Sun, 22 Mar 2026 17:56:28 +0800 Subject: [PATCH 06/26] topsql: add manager discovery and topru keyspace routing --- src/common/keyspace_cluster.rs | 331 +++++++++ src/common/mod.rs | 1 + src/common/topology/fetch/mod.rs | 49 +- src/common/topology/fetch/tidb_manager.rs | 373 ++++++++++ src/sinks/topsql_data_deltalake/arch.md | 1 + src/sinks/topsql_data_deltalake/mod.rs | 41 +- src/sinks/topsql_data_deltalake/processor.rs | 672 ++++++++++++------- src/sources/keyviz.rs | 2 + src/sources/system_tables/controller.rs | 4 + src/sources/topsql/controller.rs | 4 + src/sources/topsql/mod.rs | 12 + src/sources/topsql_v2/arch.md | 7 + src/sources/topsql_v2/controller.rs | 4 + src/sources/topsql_v2/mod.rs | 12 + 14 files changed, 1261 insertions(+), 252 deletions(-) create mode 100644 src/common/keyspace_cluster.rs create mode 100644 src/common/topology/fetch/tidb_manager.rs diff --git a/src/common/keyspace_cluster.rs b/src/common/keyspace_cluster.rs new file mode 100644 index 0000000..41a5249 --- /dev/null +++ b/src/common/keyspace_cluster.rs @@ -0,0 +1,331 @@ +use std::collections::HashMap; +use std::fs; +use std::sync::Arc; +use std::time::Duration; + +use reqwest::{Certificate, Client, Identity, StatusCode}; +use serde::Deserialize; +use tokio::sync::Mutex; +use url::form_urlencoded::byte_serialize; +use vector_lib::tls::TlsConfig; + +type BoxError = Box; + +const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); +const CONNECT_TIMEOUT: Duration = Duration::from_secs(3); + +const ORG_ID_KEYS: &[&str] = &["serverless_tenant_id"]; +const CLUSTER_ID_KEYS: &[&str] = &["serverless_cluster_id"]; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct 
KeyspaceRoute { + pub org_id: String, + pub cluster_id: String, +} + +#[derive(Clone)] +pub struct PdKeyspaceResolver { + base_url: String, + client: Client, + cache: Arc>>, +} + +#[derive(Debug, Deserialize)] +struct PdKeyspaceMetadata { + config: Option>, +} + +impl PdKeyspaceResolver { + pub fn new(pd_address: impl Into, pd_tls: Option) -> Result { + let client = build_http_client(pd_tls.as_ref())?; + Ok(Self::new_with_client(pd_address, pd_tls.as_ref(), client)) + } + + pub fn new_with_client( + pd_address: impl Into, + pd_tls: Option<&TlsConfig>, + client: Client, + ) -> Self { + Self { + base_url: normalize_pd_address(&pd_address.into(), pd_tls.is_some()), + client, + cache: Arc::new(Mutex::new(HashMap::new())), + } + } + + pub async fn resolve_keyspace( + &self, + keyspace_name: &str, + ) -> Result, BoxError> { + if keyspace_name.is_empty() { + return Ok(None); + } + + if let Some(cached) = self.cache.lock().await.get(keyspace_name).cloned() { + return Ok(Some(cached)); + } + + let encoded_keyspace = byte_serialize(keyspace_name.as_bytes()).collect::(); + let response = self + .client + .get(format!( + "{}/pd/api/v2/keyspaces/{}", + self.base_url, encoded_keyspace + )) + .send() + .await?; + + match response.status() { + StatusCode::NOT_FOUND => return Ok(None), + status if !status.is_success() => { + let body = response.text().await.unwrap_or_default(); + if is_not_found_body(&body) { + return Ok(None); + } + return Err(format!( + "pd keyspace lookup failed for {} with status {}: {}", + keyspace_name, status, body + ) + .into()); + } + _ => {} + } + + let metadata: PdKeyspaceMetadata = response.json().await?; + let route = metadata.config.as_ref().and_then(extract_route_from_config); + + if let Some(route) = route.clone() { + self.cache + .lock() + .await + .insert(keyspace_name.to_string(), route); + } + + Ok(route) + } +} + +fn build_http_client(pd_tls: Option<&TlsConfig>) -> Result { + let mut builder = Client::builder() + .timeout(REQUEST_TIMEOUT) + 
.connect_timeout(CONNECT_TIMEOUT); + + if let Some(tls) = pd_tls { + builder = builder + .danger_accept_invalid_certs(!tls.verify_certificate.unwrap_or(true)) + .danger_accept_invalid_hostnames(!tls.verify_hostname.unwrap_or(true)); + + if let Some(ca_file) = tls.ca_file.as_ref() { + let ca = fs::read(ca_file)?; + builder = builder.add_root_certificate(Certificate::from_pem(&ca)?); + } + + match (tls.crt_file.as_ref(), tls.key_file.as_ref()) { + (Some(crt_file), Some(key_file)) => { + let crt = fs::read(crt_file)?; + let key = fs::read(key_file)?; + builder = builder.identity(Identity::from_pkcs8_pem(&crt, &key)?); + } + (None, None) => {} + _ => { + return Err( + "pd_tls.crt_file and pd_tls.key_file must both be set when client TLS is enabled" + .into(), + ); + } + } + } + + Ok(builder.build()?) +} + +fn normalize_pd_address(pd_address: &str, use_tls: bool) -> String { + let trimmed = pd_address.trim().trim_end_matches('/'); + if trimmed.starts_with("http://") || trimmed.starts_with("https://") { + trimmed.to_string() + } else if use_tls { + format!("https://{}", trimmed) + } else { + format!("http://{}", trimmed) + } +} + +fn is_not_found_body(body: &str) -> bool { + let lower = body.to_ascii_lowercase(); + lower.contains("not found") +} + +fn extract_route_from_config(config: &HashMap) -> Option { + let org_id = find_config_value(config, ORG_ID_KEYS)?; + let cluster_id = find_config_value(config, CLUSTER_ID_KEYS)?; + + if org_id.is_empty() || cluster_id.is_empty() { + return None; + } + + Some(KeyspaceRoute { + org_id: org_id.to_string(), + cluster_id: cluster_id.to_string(), + }) +} + +fn find_config_value<'a>(config: &'a HashMap, keys: &[&str]) -> Option<&'a str> { + keys.iter() + .find_map(|key| config.get(*key)) + .map(String::as_str) + .filter(|value| !value.is_empty()) +} + +#[cfg(test)] +mod tests { + use std::convert::Infallible; + use std::net::TcpListener; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + + use 
hyper::service::{make_service_fn, service_fn}; + use hyper::{Body, Request, Response, Server, StatusCode as HyperStatusCode}; + + use super::*; + + #[test] + fn normalize_pd_address_adds_expected_scheme() { + assert_eq!(normalize_pd_address("pd:2379/", false), "http://pd:2379"); + assert_eq!(normalize_pd_address("pd:2379/", true), "https://pd:2379"); + assert_eq!( + normalize_pd_address("https://pd:2379", false), + "https://pd:2379" + ); + } + + #[test] + fn extract_route_from_config_uses_serverless_route_keys() { + let mut serverless_config = HashMap::new(); + serverless_config.insert("serverless_tenant_id".to_string(), "30018".to_string()); + serverless_config.insert( + "serverless_cluster_id".to_string(), + "10155668891296301432".to_string(), + ); + + assert_eq!( + extract_route_from_config(&serverless_config), + Some(KeyspaceRoute { + org_id: "30018".to_string(), + cluster_id: "10155668891296301432".to_string(), + }) + ); + } + + #[test] + fn extract_route_from_config_ignores_legacy_route_keys() { + let mut legacy_config = HashMap::new(); + legacy_config.insert("tenant_id".to_string(), "30018".to_string()); + legacy_config.insert( + "tidb_cluster_id".to_string(), + "10762701230946915645".to_string(), + ); + + assert_eq!(extract_route_from_config(&legacy_config), None); + } + + #[tokio::test] + async fn resolve_keyspace_uses_pd_keyspace_api_and_caches_result() { + let request_count = Arc::new(AtomicUsize::new(0)); + let counter = Arc::clone(&request_count); + + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let address = listener.local_addr().unwrap(); + let server = Server::from_tcp(listener) + .unwrap() + .serve(make_service_fn(move |_| { + let counter = Arc::clone(&counter); + async move { + Ok::<_, Infallible>(service_fn(move |request: Request| { + let counter = Arc::clone(&counter); + async move { + counter.fetch_add(1, Ordering::SeqCst); + assert_eq!(request.uri().path(), "/pd/api/v2/keyspaces/test_keyspace"); + Ok::<_, 
Infallible>(Response::new(Body::from( + r#"{"config":{"serverless_tenant_id":"30018","serverless_cluster_id":"10762701230946915645"}}"#, + ))) + } + })) + } + })); + let server_handle = tokio::spawn(server); + + let client = Client::builder().no_proxy().build().unwrap(); + let resolver = + PdKeyspaceResolver::new_with_client(format!("http://{}", address), None, client); + + let first = resolver.resolve_keyspace("test_keyspace").await.unwrap(); + let second = resolver.resolve_keyspace("test_keyspace").await.unwrap(); + + assert_eq!( + first, + Some(KeyspaceRoute { + org_id: "30018".to_string(), + cluster_id: "10762701230946915645".to_string(), + }) + ); + assert_eq!(second, first); + assert_eq!(request_count.load(Ordering::SeqCst), 1); + + server_handle.abort(); + } + + #[tokio::test] + async fn resolve_keyspace_returns_none_for_missing_route() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let address = listener.local_addr().unwrap(); + let server = + Server::from_tcp(listener) + .unwrap() + .serve(make_service_fn(move |_| async move { + Ok::<_, Infallible>(service_fn(move |_request: Request| async move { + Ok::<_, Infallible>(Response::new(Body::from( + r#"{"config":{"tenant_id":"30018"}}"#, + ))) + })) + })); + let server_handle = tokio::spawn(server); + + let client = Client::builder().no_proxy().build().unwrap(); + let resolver = + PdKeyspaceResolver::new_with_client(format!("http://{}", address), None, client); + let route = resolver.resolve_keyspace("test_keyspace").await.unwrap(); + + assert_eq!(route, None); + + server_handle.abort(); + } + + #[tokio::test] + async fn resolve_keyspace_treats_not_found_error_body_as_empty_result() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let address = listener.local_addr().unwrap(); + let server = + Server::from_tcp(listener) + .unwrap() + .serve(make_service_fn(move |_| async move { + Ok::<_, Infallible>(service_fn(move |_request: Request| async move { + Ok::<_, Infallible>( + 
Response::builder() + .status(HyperStatusCode::INTERNAL_SERVER_ERROR) + .body(Body::from("keyspace not found")) + .unwrap(), + ) + })) + })); + let server_handle = tokio::spawn(server); + + let client = Client::builder().no_proxy().build().unwrap(); + let resolver = + PdKeyspaceResolver::new_with_client(format!("http://{}", address), None, client); + let route = resolver.resolve_keyspace("missing_keyspace").await.unwrap(); + + assert_eq!(route, None); + + server_handle.abort(); + } +} diff --git a/src/common/mod.rs b/src/common/mod.rs index 764714f..7397729 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -2,4 +2,5 @@ pub mod checkpointer; pub mod deltalake_s3; pub mod deltalake_writer; pub mod features; +pub mod keyspace_cluster; pub mod topology; diff --git a/src/common/topology/fetch/mod.rs b/src/common/topology/fetch/mod.rs index 360dfd4..4403649 100644 --- a/src/common/topology/fetch/mod.rs +++ b/src/common/topology/fetch/mod.rs @@ -2,6 +2,7 @@ mod models; mod pd; mod store; mod tidb; +mod tidb_manager; mod utils; pub mod tidb_nextgen; @@ -43,6 +44,8 @@ pub enum FetchError { FetchPDTopology { source: pd::FetchError }, #[snafu(display("Failed to fetch tidb topology: {}", source))] FetchTiDBTopology { source: tidb::FetchError }, + #[snafu(display("Failed to fetch tidb topology from manager server: {}", source))] + FetchTiDBFromManagerServerTopology { source: tidb_manager::FetchError }, #[snafu(display("Failed to fetch store topology: {}", source))] FetchStoreTopology { source: store::FetchError }, #[snafu(display("Failed to fetch tidb nextgen topology: {}", source))] @@ -56,6 +59,8 @@ pub enum FetchError { // Legacy topology fetcher pub struct LegacyTopologyFetcher { pd_address: String, + manager_server_address: Option, + tidb_namespace: Option, http_client: HttpClient, pub etcd_client: etcd_client::Client, } @@ -63,15 +68,22 @@ pub struct LegacyTopologyFetcher { impl LegacyTopologyFetcher { pub async fn new( pd_address: String, + 
manager_server_address: Option, + tidb_namespace: Option, tls_config: Option, proxy_config: &ProxyConfig, ) -> Result { let pd_address = Self::polish_address(pd_address, &tls_config)?; + let manager_server_address = manager_server_address + .map(Self::polish_manager_server_address) + .transpose()?; let http_client = Self::build_http_client(tls_config.as_ref(), proxy_config)?; let etcd_client = Self::build_etcd_client(&pd_address, &tls_config).await?; Ok(Self { pd_address, + manager_server_address, + tidb_namespace, http_client, etcd_client, }) @@ -85,10 +97,21 @@ impl LegacyTopologyFetcher { .get_up_pds(components) .await .context(FetchPDTopologySnafu)?; - tidb::TiDBTopologyFetcher::new(&mut self.etcd_client) + if let Some(manager_server_address) = self.manager_server_address.as_deref() { + tidb_manager::TiDBManagerTopologyFetcher::new( + manager_server_address, + self.tidb_namespace.as_deref(), + &self.http_client, + ) .get_up_tidbs(components) .await - .context(FetchTiDBTopologySnafu)?; + .context(FetchTiDBFromManagerServerTopologySnafu)?; + } else { + tidb::TiDBTopologyFetcher::new(&mut self.etcd_client) + .get_up_tidbs(components) + .await + .context(FetchTiDBTopologySnafu)?; + } store::StoreTopologyFetcher::new(&self.pd_address, &self.http_client) .get_up_stores(components) .await @@ -114,6 +137,17 @@ impl LegacyTopologyFetcher { Ok(address) } + fn polish_manager_server_address(mut address: String) -> Result { + let uri: hyper::Uri = address.parse().context(ParseAddressSnafu)?; + if uri.scheme().is_none() { + address = format!("http://{address}"); + } + if address.ends_with('/') { + address.pop(); + } + Ok(address) + } + fn build_http_client( tls_config: Option<&TlsConfig>, proxy_config: &ProxyConfig, @@ -234,6 +268,8 @@ impl TopologyFetcher { /// Create a new topology fetcher based on the current feature configuration pub async fn new( pd_address: Option, + manager_server_address: Option, + tidb_namespace: Option, tls_config: Option, proxy_config: 
&ProxyConfig, tidb_group: Option, @@ -252,7 +288,14 @@ impl TopologyFetcher { let pd_address = pd_address.ok_or_else(|| FetchError::ConfigurationError { message: "PD address is required in legacy mode".to_string(), })?; - let fetcher = LegacyTopologyFetcher::new(pd_address, tls_config, proxy_config).await?; + let fetcher = LegacyTopologyFetcher::new( + pd_address, + manager_server_address, + tidb_namespace, + tls_config, + proxy_config, + ) + .await?; Ok(Self { inner: TopologyFetcherImpl::Legacy(Box::new(fetcher)), }) diff --git a/src/common/topology/fetch/tidb_manager.rs b/src/common/topology/fetch/tidb_manager.rs new file mode 100644 index 0000000..21a94b3 --- /dev/null +++ b/src/common/topology/fetch/tidb_manager.rs @@ -0,0 +1,373 @@ +use std::collections::HashSet; + +use serde_json::{Map, Value}; +use snafu::{ResultExt, Snafu}; +use vector::http::HttpClient; + +use crate::common::topology::fetch::utils; +use crate::common::topology::{Component, InstanceType}; + +const GET_ACTIVE_TIDB_PATH: &str = "/api/tidb/get_active_tidb"; +const DEFAULT_TIDB_PRIMARY_PORT: u16 = 4000; +const DEFAULT_TIDB_STATUS_PORT: u16 = 10080; +const MAX_RESPONSE_DEPTH: usize = 8; + +#[derive(Debug, Snafu)] +pub enum FetchError { + #[snafu(display("Failed to build request: {}", source))] + BuildRequest { source: http::Error }, + #[snafu(display("Failed to get active tidb addresses from manager server: {}", source))] + GetActiveTiDBs { source: vector::http::HttpError }, + #[snafu(display("Failed to read active tidb response bytes: {}", source))] + GetActiveTiDBsBytes { source: hyper::Error }, + #[snafu(display("Failed to parse active tidb response JSON text: {}", source))] + ActiveTiDBJsonFromStr { source: serde_json::Error }, + #[snafu(display("Invalid manager server response: {}", message))] + InvalidManagerResponse { message: String }, + #[snafu(display("Failed to parse tidb host from manager response: {}", source))] + ParseTiDBHost { source: utils::ParseError }, +} + +#[derive(Debug, 
Clone, Eq, PartialEq)] +struct ActiveTiDBAddress { + host: String, + port: Option, + status_port: Option, + hostname: Option, +} + +pub struct TiDBManagerTopologyFetcher<'a> { + manager_server_address: &'a str, + tidb_namespace: Option<&'a str>, + http_client: &'a HttpClient, +} + +impl<'a> TiDBManagerTopologyFetcher<'a> { + pub fn new( + manager_server_address: &'a str, + tidb_namespace: Option<&'a str>, + http_client: &'a HttpClient, + ) -> Self { + Self { + manager_server_address, + tidb_namespace, + http_client, + } + } + + pub async fn get_up_tidbs( + &self, + components: &mut HashSet, + ) -> Result<(), FetchError> { + let active_tidb_addresses = self.fetch_active_tidb_addresses().await?; + if !active_tidb_addresses.is_empty() { + info!( + message = "Fetched active TiDB instances from manager server", + manager_server_address = self.manager_server_address, + tidb_namespace = ?self.tidb_namespace, + tidb_count = active_tidb_addresses.len() + ); + } + + for active_tidb in active_tidb_addresses { + let (host, primary_port) = + Self::parse_tidb_host_and_primary(&active_tidb.host, active_tidb.port)?; + let secondary_port = active_tidb.status_port.unwrap_or(DEFAULT_TIDB_STATUS_PORT); + + components.insert(Component { + instance_type: InstanceType::TiDB, + host, + primary_port, + secondary_port, + instance_name: active_tidb.hostname.filter(|name| !name.trim().is_empty()), + }); + } + + Ok(()) + } + + async fn fetch_active_tidb_addresses(&self) -> Result, FetchError> { + let Some(endpoint_url) = self.active_tidb_endpoint_url() else { + return Ok(Vec::new()); + }; + + let req = http::Request::get(endpoint_url) + .body(hyper::Body::empty()) + .context(BuildRequestSnafu)?; + + let res = self + .http_client + .send(req) + .await + .context(GetActiveTiDBsSnafu)?; + let bytes = hyper::body::to_bytes(res.into_body()) + .await + .context(GetActiveTiDBsBytesSnafu)?; + + Self::parse_active_tidb_addresses_response(&bytes) + } + + fn active_tidb_endpoint_url(&self) -> Option { + 
let namespaces = Self::normalize_namespaces(self.tidb_namespace)?; + Some(Self::build_active_tidb_endpoint_url( + self.manager_server_address, + &namespaces, + )) + } + + fn normalize_namespaces(namespaces: Option<&str>) -> Option { + let namespaces = namespaces?; + let normalized = namespaces + .split(',') + .map(str::trim) + .filter(|ns| !ns.is_empty()) + .collect::>(); + if normalized.is_empty() { + None + } else { + Some(normalized.join(",")) + } + } + + fn build_active_tidb_endpoint_url(manager_server_address: &str, namespaces: &str) -> String { + let mut endpoint = if manager_server_address.ends_with(GET_ACTIVE_TIDB_PATH) { + manager_server_address.to_owned() + } else { + format!("{manager_server_address}{GET_ACTIVE_TIDB_PATH}") + }; + endpoint.push_str("?namespace="); + endpoint.push_str(namespaces); + endpoint + } + + fn parse_tidb_host_and_primary( + host_or_address: &str, + explicit_port: Option, + ) -> Result<(String, u16), FetchError> { + let host_or_address = host_or_address.trim_end_matches('/'); + if let Ok((host, parsed_port)) = utils::parse_host_port(host_or_address) { + return Ok((host, explicit_port.unwrap_or(parsed_port))); + } + + let default_address = format!("{host_or_address}:{DEFAULT_TIDB_PRIMARY_PORT}"); + let (host, _) = utils::parse_host_port(&default_address).context(ParseTiDBHostSnafu)?; + Ok((host, explicit_port.unwrap_or(DEFAULT_TIDB_PRIMARY_PORT))) + } + + fn parse_active_tidb_addresses_response( + bytes: &[u8], + ) -> Result, FetchError> { + let value = serde_json::from_slice::(bytes).context(ActiveTiDBJsonFromStrSnafu)?; + let addresses = Self::extract_active_tidb_addresses(&value, 0)?; + + if addresses.is_empty() { + return Err(FetchError::InvalidManagerResponse { + message: "no active tidb addresses found".to_owned(), + }); + } + + Ok(addresses) + } + + fn extract_active_tidb_addresses( + value: &Value, + depth: usize, + ) -> Result, FetchError> { + if depth > MAX_RESPONSE_DEPTH { + return Err(FetchError::InvalidManagerResponse 
{ + message: "response nesting is too deep".to_owned(), + }); + } + + match value { + Value::String(host) => Ok(vec![ActiveTiDBAddress { + host: host.clone(), + port: None, + status_port: None, + hostname: None, + }]), + Value::Array(items) => { + let mut addresses = Vec::new(); + for item in items { + addresses.extend(Self::extract_active_tidb_addresses(item, depth + 1)?); + } + Ok(addresses) + } + Value::Object(obj) => { + if let Some(address) = Self::extract_active_tidb_address_from_object(obj) { + return Ok(vec![address]); + } + + for key in [ + "data", + "result", + "active_tidb_addresses", + "tidb_addresses", + "active_tidbs", + "tidbs", + "addresses", + "instances", + "items", + "nodes", + "list", + ] { + if let Some(next_value) = obj.get(key) { + let addresses = Self::extract_active_tidb_addresses(next_value, depth + 1)?; + if !addresses.is_empty() { + return Ok(addresses); + } + } + } + + Ok(Vec::new()) + } + _ => Ok(Vec::new()), + } + } + + fn extract_active_tidb_address_from_object( + obj: &Map, + ) -> Option { + let host = Self::extract_string_field( + obj, + &["host", "address", "tidb_address", "active_tidb_address"], + )?; + let port = Self::extract_u16_field(obj, &["port", "primary_port"]); + let status_port = Self::extract_u16_field(obj, &["status_port", "secondary_port"]); + let hostname = Self::extract_string_field(obj, &["hostname", "pod_name", "instance_name"]); + + Some(ActiveTiDBAddress { + host, + port, + status_port, + hostname, + }) + } + + fn extract_string_field(obj: &Map, keys: &[&str]) -> Option { + keys.iter() + .find_map(|key| obj.get(*key).and_then(Value::as_str).map(str::to_owned)) + } + + fn extract_u16_field(obj: &Map, keys: &[&str]) -> Option { + keys.iter().find_map(|key| { + obj.get(*key) + .and_then(Value::as_u64) + .and_then(|raw| u16::try_from(raw).ok()) + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_response_new_schema() { + let bytes = br#"[ + 
{"host":"10.0.0.1","port":4000,"status_port":10080,"hostname":"tidb-0"}, + {"host":"10.0.0.2","port":4000,"status_port":10080,"hostname":"tidb-1"} + ]"#; + let addresses = + TiDBManagerTopologyFetcher::parse_active_tidb_addresses_response(bytes).unwrap(); + + assert_eq!( + addresses, + vec![ + ActiveTiDBAddress { + host: "10.0.0.1".to_owned(), + port: Some(4000), + status_port: Some(10080), + hostname: Some("tidb-0".to_owned()), + }, + ActiveTiDBAddress { + host: "10.0.0.2".to_owned(), + port: Some(4000), + status_port: Some(10080), + hostname: Some("tidb-1".to_owned()), + } + ] + ); + } + + #[test] + fn parse_response_invalid_format() { + let bytes = br#"{"code":0,"message":"ok"}"#; + let err = TiDBManagerTopologyFetcher::parse_active_tidb_addresses_response(bytes) + .expect_err("expected invalid manager response"); + assert!(matches!(err, FetchError::InvalidManagerResponse { .. })); + } + + #[test] + fn parse_tidb_host_and_primary_with_address() { + let (host, primary_port) = + TiDBManagerTopologyFetcher::parse_tidb_host_and_primary("10.0.0.1:4100", None).unwrap(); + assert_eq!(host, "10.0.0.1"); + assert_eq!(primary_port, 4100); + } + + #[test] + fn parse_tidb_host_and_primary_with_host_only() { + let (host, primary_port) = + TiDBManagerTopologyFetcher::parse_tidb_host_and_primary("10.0.0.1", None).unwrap(); + assert_eq!(host, "10.0.0.1"); + assert_eq!(primary_port, DEFAULT_TIDB_PRIMARY_PORT); + } + + #[test] + fn parse_tidb_host_and_primary_with_explicit_port() { + let (host, primary_port) = + TiDBManagerTopologyFetcher::parse_tidb_host_and_primary("10.0.0.1", Some(4200)) + .unwrap(); + assert_eq!(host, "10.0.0.1"); + assert_eq!(primary_port, 4200); + } + + #[test] + fn build_endpoint_url_with_namespaces() { + let endpoint = TiDBManagerTopologyFetcher::build_active_tidb_endpoint_url( + "http://manager:8080", + "super-vip-tidb-pool,canary-super-vip-tidb-pool", + ); + assert_eq!( + endpoint, + 
"http://manager:8080/api/tidb/get_active_tidb?namespace=super-vip-tidb-pool,canary-super-vip-tidb-pool" + ); + } + + #[test] + fn build_endpoint_url_with_full_path() { + let endpoint = TiDBManagerTopologyFetcher::build_active_tidb_endpoint_url( + "http://manager:8080/api/tidb/get_active_tidb", + "super-vip-tidb-pool,canary-super-vip-tidb-pool", + ); + assert_eq!( + endpoint, + "http://manager:8080/api/tidb/get_active_tidb?namespace=super-vip-tidb-pool,canary-super-vip-tidb-pool" + ); + } + + #[test] + fn normalize_namespaces_none_or_empty() { + assert_eq!(TiDBManagerTopologyFetcher::normalize_namespaces(None), None); + assert_eq!( + TiDBManagerTopologyFetcher::normalize_namespaces(Some("")), + None + ); + assert_eq!( + TiDBManagerTopologyFetcher::normalize_namespaces(Some(" , ")), + None + ); + } + + #[test] + fn normalize_namespaces_trim_and_filter() { + let normalized = TiDBManagerTopologyFetcher::normalize_namespaces(Some( + " super-vip-tidb-pool, canary-super-vip-tidb-pool , ", + )); + assert_eq!( + normalized.as_deref(), + Some("super-vip-tidb-pool,canary-super-vip-tidb-pool") + ); + } +} diff --git a/src/sinks/topsql_data_deltalake/arch.md b/src/sinks/topsql_data_deltalake/arch.md index 999c95d..1a77bdf 100644 --- a/src/sinks/topsql_data_deltalake/arch.md +++ b/src/sinks/topsql_data_deltalake/arch.md @@ -56,6 +56,7 @@ pub struct TopSQLDataDeltaLakeConfig { - **SQL Digest Grouping**: Group by SQL digest - **Time Partitioning**: Partition by execution time - **Schema Optimization**: Optimized schema for TopSQL data +- **Keyspace-based Routing**: Optional PD keyspace lookup can prepend `org=/cluster=` path segments before the table layout, which is especially useful for `topru` data written to shared S3 prefixes ## Dependencies diff --git a/src/sinks/topsql_data_deltalake/mod.rs b/src/sinks/topsql_data_deltalake/mod.rs index 21a65b8..e18be0b 100644 --- a/src/sinks/topsql_data_deltalake/mod.rs +++ b/src/sinks/topsql_data_deltalake/mod.rs @@ -25,8 +25,13 @@ use 
tracing::{error, info, warn}; mod processor; // Import default functions from common module -use crate::common::deltalake_writer::{default_batch_size, default_timeout_secs}; use crate::common::deltalake_s3; +use crate::common::deltalake_writer::{default_batch_size, default_timeout_secs}; +use crate::common::keyspace_cluster::PdKeyspaceResolver; + +pub const fn default_enable_keyspace_cluster_mapping() -> bool { + false +} pub const fn default_max_delay_secs() -> u64 { 180 @@ -55,6 +60,16 @@ pub struct DeltaLakeConfig { #[serde(default = "default_max_delay_secs")] pub max_delay_secs: u64, + /// Whether to resolve keyspace to org/cluster path segments through PD. + #[serde(default = "default_enable_keyspace_cluster_mapping")] + pub enable_keyspace_cluster_mapping: bool, + + /// PD address used to resolve keyspace to org/cluster path segments. + pub pd_address: Option, + + /// TLS configuration for PD keyspace lookup. + pub pd_tls: Option, + /// Storage options for cloud storage pub storage_options: Option>, @@ -100,6 +115,9 @@ impl GenerateConfig for DeltaLakeConfig { batch_size: default_batch_size(), timeout_secs: default_timeout_secs(), max_delay_secs: default_max_delay_secs(), + enable_keyspace_cluster_mapping: default_enable_keyspace_cluster_mapping(), + pd_address: None, + pd_tls: None, storage_options: None, bucket: None, options: None, @@ -219,12 +237,31 @@ impl DeltaLakeConfig { info!("No S3 service available - using default storage options only"); } + let keyspace_route_resolver = if self.enable_keyspace_cluster_mapping { + let pd_address = self.pd_address.as_deref().ok_or_else(|| { + vector::Error::from( + "pd_address is required when enable_keyspace_cluster_mapping is true", + ) + })?; + Some( + PdKeyspaceResolver::new(pd_address, self.pd_tls.clone()).map_err(|error| { + vector::Error::from(format!( + "failed to build PD keyspace resolver from pd_address: {}", + error + )) + })?, + ) + } else { + None + }; + let sink = TopSQLDeltaLakeSink::new( base_path, 
table_configs, write_config, self.max_delay_secs, Some(storage_options), + keyspace_route_resolver, ); Ok(VectorSink::from_event_streamsink(sink)) @@ -272,4 +309,4 @@ mod tests { fn generate_config() { vector::test_util::test_generate_config::(); } -} \ No newline at end of file +} diff --git a/src/sinks/topsql_data_deltalake/processor.rs b/src/sinks/topsql_data_deltalake/processor.rs index 2620411..a08e0df 100644 --- a/src/sinks/topsql_data_deltalake/processor.rs +++ b/src/sinks/topsql_data_deltalake/processor.rs @@ -1,23 +1,24 @@ use std::collections::HashMap; use std::path::PathBuf; use std::sync::Arc; +use std::time::Duration; use futures::{stream::BoxStream, StreamExt}; -use tokio::sync::Mutex; use tokio::sync::mpsc; -use vector_lib::event::Event; +use tokio::sync::Mutex; +use vector_lib::event::{Event, LogEvent}; use vector_lib::sink::StreamSink; use crate::common::deltalake_writer::{DeltaLakeWriter, DeltaTableConfig, WriteConfig}; +use crate::common::keyspace_cluster::{KeyspaceRoute, PdKeyspaceResolver}; use crate::sources::topsql_v2::upstream::consts::{ - LABEL_PLAN_DIGEST, LABEL_REGION_ID, LABEL_INSTANCE_KEY, LABEL_SQL_DIGEST, LABEL_TIMESTAMPS, - LABEL_DATE, LABEL_KEYSPACE, LABEL_TAG_LABEL, LABEL_DB_NAME, LABEL_TABLE_NAME, LABEL_TABLE_ID, - LABEL_SOURCE_TABLE, LABEL_USER, SOURCE_TABLE_TOPRU, - METRIC_NAME_CPU_TIME_MS, METRIC_NAME_LOGICAL_READ_BYTES, METRIC_NAME_LOGICAL_WRITE_BYTES, + LABEL_DATE, LABEL_DB_NAME, LABEL_INSTANCE_KEY, LABEL_KEYSPACE, LABEL_PLAN_DIGEST, + LABEL_REGION_ID, LABEL_SOURCE_TABLE, LABEL_SQL_DIGEST, LABEL_TABLE_ID, LABEL_TABLE_NAME, + LABEL_TAG_LABEL, LABEL_TIMESTAMPS, LABEL_USER, METRIC_NAME_CPU_TIME_MS, METRIC_NAME_EXEC_COUNT, + METRIC_NAME_EXEC_DURATION, METRIC_NAME_LOGICAL_READ_BYTES, METRIC_NAME_LOGICAL_WRITE_BYTES, METRIC_NAME_NETWORK_IN_BYTES, METRIC_NAME_NETWORK_OUT_BYTES, METRIC_NAME_READ_KEYS, - METRIC_NAME_STMT_EXEC_COUNT, METRIC_NAME_WRITE_KEYS, - METRIC_NAME_STMT_DURATION_COUNT, METRIC_NAME_STMT_DURATION_SUM_NS, - 
METRIC_NAME_TOTAL_RU, METRIC_NAME_EXEC_COUNT, METRIC_NAME_EXEC_DURATION, + METRIC_NAME_STMT_DURATION_COUNT, METRIC_NAME_STMT_DURATION_SUM_NS, METRIC_NAME_STMT_EXEC_COUNT, + METRIC_NAME_TOTAL_RU, METRIC_NAME_WRITE_KEYS, SOURCE_TABLE_TOPRU, }; use lazy_static::lazy_static; @@ -72,7 +73,7 @@ lazy_static! { "mysql_type": "text", "is_nullable": true }), - ); + ); schema_info.insert( LABEL_SQL_DIGEST.into(), serde_json::json!({ @@ -247,6 +248,8 @@ lazy_static! { }; } +const ROUTE_RESOLUTION_RETRY_DELAY: Duration = Duration::from_secs(5); + /// Delta Lake sink processor pub struct TopSQLDeltaLakeSink { base_path: PathBuf, @@ -254,10 +257,17 @@ pub struct TopSQLDeltaLakeSink { write_config: WriteConfig, max_delay_secs: u64, storage_options: Option>, - writers: Arc>>, + keyspace_route_resolver: Option, + writers: Arc>>, tx: Arc>>>, } +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +struct WriterKey { + table_name: String, + table_path: PathBuf, +} + impl TopSQLDeltaLakeSink { /// Create a new Delta Lake sink pub fn new( @@ -266,11 +276,12 @@ impl TopSQLDeltaLakeSink { write_config: WriteConfig, max_delay_secs: u64, storage_options: Option>, + keyspace_route_resolver: Option, ) -> Self { // Create a channel with capacity 1 let (tx, rx) = mpsc::channel(1); let tx = Arc::new(tx); - + // Create sink instance let sink = Arc::new(Self { base_path, @@ -278,16 +289,17 @@ impl TopSQLDeltaLakeSink { write_config, max_delay_secs, storage_options, + keyspace_route_resolver, writers: Arc::new(Mutex::new(HashMap::new())), tx: Arc::clone(&tx), }); - + // Spawn process_events_loop as a separate tokio task to avoid blocking let sink_clone = Arc::clone(&sink); tokio::spawn(async move { sink_clone.process_events_loop(rx).await; }); - + // Return the sink (Arc::try_unwrap will fail because tokio task holds a reference, // so we use unsafe to manually get the inner value without decrementing the reference count) // Safety: We know there's exactly one more reference (the tokio task), @@ -306,6 
+318,7 @@ impl TopSQLDeltaLakeSink { write_config: inner_ref.write_config.clone(), max_delay_secs: inner_ref.max_delay_secs, storage_options: inner_ref.storage_options.clone(), + keyspace_route_resolver: inner_ref.keyspace_route_resolver.clone(), writers: Arc::clone(&inner_ref.writers), tx: Arc::clone(&inner_ref.tx), }; @@ -314,7 +327,7 @@ impl TopSQLDeltaLakeSink { inner_value } } - + #[cfg(test)] /// Create a new Delta Lake sink for testing, returning both the sink and the receiver /// The receiver can be used to verify messages sent through the channel @@ -325,11 +338,15 @@ impl TopSQLDeltaLakeSink { write_config: WriteConfig, max_delay_secs: u64, storage_options: Option>, + keyspace_route_resolver: Option, ) -> (Self, mpsc::Receiver>>) { // Create a channel with capacity 1 - let (tx, rx): (mpsc::Sender>>, mpsc::Receiver>>) = mpsc::channel(1); + let (tx, rx): ( + mpsc::Sender>>, + mpsc::Receiver>>, + ) = mpsc::channel(1); let tx = Arc::new(tx); - + // Create sink instance (without starting process_events_loop) let sink = Self { base_path, @@ -337,22 +354,34 @@ impl TopSQLDeltaLakeSink { write_config, max_delay_secs, storage_options, + keyspace_route_resolver, writers: Arc::new(Mutex::new(HashMap::new())), tx, }; - + // Return the sink and receiver for testing (sink, rx) } /// Process events from channel and write to Delta Lake - async fn process_events_loop( - &self, - mut rx: mpsc::Receiver>>, - ) { + async fn process_events_loop(&self, mut rx: mpsc::Receiver>>) { while let Some(events_vec) = rx.recv().await { - if let Err(e) = self.process_events(events_vec).await { - error!("Failed to process events: {}", e); + let retry_on_failure = self.keyspace_route_resolver.is_some(); + let mut pending_events = events_vec; + + loop { + let retry_snapshot = retry_on_failure.then(|| pending_events.clone()); + match self.process_events(pending_events).await { + Ok(()) => break, + Err(error) => { + error!("Failed to process events: {}", error); + let Some(events) = 
retry_snapshot else { + break; + }; + tokio::time::sleep(ROUTE_RESOLUTION_RETRY_DELAY).await; + pending_events = events; + } + } } } } @@ -365,36 +394,27 @@ impl TopSQLDeltaLakeSink { if events_vec.is_empty() { return Ok(()); } - // Group events by table_name (instance_key for topsql/tikv, source_table for topru) - let mut table_events: HashMap> = HashMap::new(); + let mut table_events: HashMap> = HashMap::new(); + let mut resolved_routes: HashMap> = HashMap::new(); for events in events_vec { for event in events { if let Event::Log(log_event) = event { - let table_name: Option = log_event - .get(LABEL_INSTANCE_KEY) - .and_then(|v| v.as_str()) - .map(|s| s.to_string()) - .or_else(|| { - // TopRU events lack instance_key; use source_table as grouping key - log_event - .get(LABEL_SOURCE_TABLE) - .and_then(|v| v.as_str()) - .filter(|s| *s == SOURCE_TABLE_TOPRU) - .map(|s| s.to_string()) - }); - if let Some(name) = table_name { + if let Some(writer_key) = self + .resolve_writer_key(&log_event, &mut resolved_routes) + .await? 
+ { table_events - .entry(name) + .entry(writer_key) .or_insert_with(Vec::new) .push(Event::Log(log_event)); } } } } - // Write table's events - for (table_name, mut events) in table_events { - self.add_schema_info(&mut events, &table_name); - if let Err(e) = self.write_table_events(&table_name, events).await { + + for (writer_key, mut events) in table_events { + self.add_schema_info(&mut events, &writer_key.table_name); + if let Err(e) = self.write_table_events(&writer_key, events).await { let error_msg = e.to_string(); if error_msg.contains("log segment") || error_msg.contains("Invalid table version") @@ -403,10 +423,15 @@ impl TopSQLDeltaLakeSink { { panic!( "Delta Lake corruption detected for table {}: {}", - table_name, error_msg + writer_key.table_name, error_msg ); } else { - error!("Failed to write events to table {}: {}", table_name, e); + error!( + "Failed to write events to table {} at {}: {}", + writer_key.table_name, + writer_key.table_path.display(), + e + ); } } } @@ -429,63 +454,148 @@ impl TopSQLDeltaLakeSink { log.insert("_schema_metadata", serde_json::Value::Object(schema)); } + fn extract_table_name(log_event: &LogEvent) -> Option { + log_event + .get(LABEL_INSTANCE_KEY) + .and_then(|value| value.as_str()) + .map(|value| value.to_string()) + .or_else(|| { + log_event + .get(LABEL_SOURCE_TABLE) + .and_then(|value| value.as_str()) + .filter(|value| *value == SOURCE_TABLE_TOPRU) + .map(|value| value.to_string()) + }) + } + + async fn resolve_writer_key( + &self, + log_event: &LogEvent, + resolved_routes: &mut HashMap>, + ) -> Result, Box> { + let Some(table_name) = Self::extract_table_name(log_event) else { + return Ok(None); + }; + let route = self + .resolve_keyspace_route(log_event, resolved_routes) + .await?; + if self.keyspace_route_resolver.is_some() && route.is_none() { + return Ok(None); + } + Ok(Some(WriterKey { + table_name: table_name.clone(), + table_path: self.build_table_path(&table_name, route.as_ref()), + })) + } + + async fn 
resolve_keyspace_route( + &self, + log_event: &LogEvent, + resolved_routes: &mut HashMap>, + ) -> Result, Box> { + let Some(resolver) = self.keyspace_route_resolver.as_ref() else { + return Ok(None); + }; + let Some(keyspace) = log_event + .get(LABEL_KEYSPACE) + .and_then(|value| value.as_str()) + else { + return Ok(None); + }; + + if let Some(route) = resolved_routes.get(keyspace.as_ref()) { + return Ok(route.clone()); + } + + let route = resolver.resolve_keyspace(keyspace.as_ref()).await?; + resolved_routes.insert(keyspace.to_string(), route.clone()); + if route.is_none() { + warn!( + "No cluster route found for keyspace {}, skipping TopSQL data event", + keyspace + ); + } + Ok(route) + } + + fn build_table_path(&self, table_name: &str, route: Option<&KeyspaceRoute>) -> PathBuf { + let (table_type, table_instance) = Self::table_partition_values(table_name); + + let mut segments = Vec::new(); + if let Some(route) = route { + segments.push(format!("org={}", route.org_id)); + segments.push(format!("cluster={}", route.cluster_id)); + } + segments.push(format!("type=topsql_{}", table_type)); + segments.push(format!("instance={}", table_instance)); + + let segment_refs: Vec<&str> = segments.iter().map(|segment| segment.as_str()).collect(); + Self::join_path(&self.base_path, &segment_refs) + } + + fn table_partition_values(table_name: &str) -> (&str, &str) { + if table_name == SOURCE_TABLE_TOPRU { + ("topru", "default") + } else { + match table_name + .strip_prefix("topsql_") + .and_then(|rest| rest.split_once('_')) + { + Some((table_type, table_instance)) + if !table_type.is_empty() && !table_instance.is_empty() => + { + (table_type, table_instance) + } + _ => { + error!( + "Unexpected table_name format (expected `topsql_{{type}}_{{instance}}` or `topsql_topru`): {}", + table_name + ); + ("unknown", "unknown") + } + } + } + } + + fn join_path(base_path: &PathBuf, segments: &[&str]) -> PathBuf { + if base_path.to_string_lossy().starts_with("s3://") { + let mut path = 
base_path + .to_string_lossy() + .trim_end_matches('/') + .to_string(); + for segment in segments { + path.push('/'); + path.push_str(segment); + } + PathBuf::from(path) + } else { + let mut path = base_path.clone(); + for segment in segments { + path = path.join(segment); + } + path + } + } + /// Write events to a specific table async fn write_table_events( &self, - table_name: &str, + writer_key: &WriterKey, events: Vec, ) -> Result<(), Box> { - // Get or create writer for this table let mut writers = self.writers.lock().await; - let writer = writers.entry(table_name.to_string()).or_insert_with(|| { - let (table_type, table_instance) = if table_name == SOURCE_TABLE_TOPRU { - ("topru", "default") - } else { - match table_name - .strip_prefix("topsql_") - .and_then(|rest| rest.split_once('_')) - { - Some((t, inst)) if !t.is_empty() && !inst.is_empty() => (t, inst), - _ => { - error!( - "Unexpected table_name format (expected `topsql_{{type}}_{{instance}}` or `topsql_topru`): {}", - table_name - ); - ("unknown", "unknown") - } - } - }; - - let type_dir = format!("component={}", table_type); - let instance_dir = format!("instance={}", table_instance); - - let table_path = if self.base_path.to_string_lossy().starts_with("s3://") { - // For S3 paths, build a partition-like directory structure - // /topsql/data/component=.../instance=.../ - let base = self.base_path.to_string_lossy(); - let base = base.trim_end_matches('/'); - PathBuf::from(format!( - "{}/{}/{}", - base, type_dir, instance_dir - )) - } else { - // For local paths, use join as before - self.base_path - .join(&type_dir) - .join(&instance_dir) - }; - + let writer = writers.entry(writer_key.clone()).or_insert_with(|| { let table_config = self .tables .iter() - .find(|t| t.name == table_name) + .find(|table| table.name == writer_key.table_name) .cloned() .unwrap_or_else(|| DeltaTableConfig { - name: table_name.to_string(), + name: writer_key.table_name.clone(), schema_evolution: Some(true), }); 
DeltaLakeWriter::new_with_options( - table_path, + writer_key.table_path.clone(), table_config, self.write_config.clone(), self.storage_options.clone(), @@ -537,13 +647,15 @@ impl StreamSink for TopSQLDeltaLakeSink { events_cache.push(events); // Allow max delay to configured value, continue if not ready to send - if events_count + cur_cached_size < sink.write_config.batch_size - && latest_timestamp < oldest_timestamp + sink.max_delay_secs as i64 { + if events_count + cur_cached_size < sink.write_config.batch_size + && latest_timestamp < oldest_timestamp + sink.max_delay_secs as i64 + { continue; } // Send events to process_events through channel - let should_drop_on_full = latest_timestamp >= oldest_timestamp + sink.max_delay_secs as i64; + let should_drop_on_full = + latest_timestamp >= oldest_timestamp + sink.max_delay_secs as i64; match tx.try_send(events_cache) { Ok(_) => { // Successfully sent, clear the cache @@ -570,7 +682,7 @@ impl StreamSink for TopSQLDeltaLakeSink { } } } - + // When the input stream ends, try to send any remaining cached events if !events_cache.is_empty() { // Send remaining events, wait if channel is full @@ -579,7 +691,7 @@ impl StreamSink for TopSQLDeltaLakeSink { error!("Channel closed when flushing remaining events, dropping events"); } } - + // Note: We don't drop tx here as it's owned by the sink and may be used by other run() calls // The channel will be closed when the sink is dropped Ok(()) @@ -601,7 +713,9 @@ mod tests { event } - fn create_test_sink_with_receiver(batch_size: usize) -> (TopSQLDeltaLakeSink, mpsc::Receiver>>) { + fn create_test_sink_with_receiver( + batch_size: usize, + ) -> (TopSQLDeltaLakeSink, mpsc::Receiver>>) { TopSQLDeltaLakeSink::new_for_test( PathBuf::from("/tmp/test"), vec![], @@ -611,52 +725,107 @@ mod tests { }, 180, // Use default value for tests None, + None, ) } + #[test] + fn test_build_table_path_with_meta_route_for_s3() { + let (sink, _) = TopSQLDeltaLakeSink::new_for_test( + 
PathBuf::from("s3://o11y-prod-shared-us-west-2-premium/deltalake"), + vec![], + WriteConfig { + batch_size: 1, + timeout_secs: 0, + }, + 180, + None, + None, + ); + + let table_path = sink.build_table_path( + "topsql_tidb_127.0.0.1:10080", + Some(&KeyspaceRoute { + org_id: "1369847559692509642".to_string(), + cluster_id: "10110362358366286743".to_string(), + }), + ); + + assert_eq!( + table_path, + PathBuf::from( + "s3://o11y-prod-shared-us-west-2-premium/deltalake/org=1369847559692509642/cluster=10110362358366286743/type=topsql_tidb/instance=127.0.0.1:10080" + ) + ); + } + + #[test] + fn test_build_table_path_without_meta_route_preserves_existing_layout() { + let (sink, _) = TopSQLDeltaLakeSink::new_for_test( + PathBuf::from("/tmp/deltalake"), + vec![], + WriteConfig { + batch_size: 1, + timeout_secs: 0, + }, + 180, + None, + None, + ); + + let table_path = sink.build_table_path("topsql_topru", None); + + assert_eq!( + table_path, + PathBuf::from("/tmp/deltalake/type=topsql_topru/instance=default") + ); + } + #[tokio::test] async fn test_send_when_batch_size_reached() { let batch_size = 5; let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - + // Create events that will reach batch size let events: Vec = (0..batch_size) .map(|i| create_test_event(1000 + i as i64)) .collect(); - + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Wait a bit for the message to be sent tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Verify that a message was sent through the channel - let received = tokio::time::timeout( - tokio::time::Duration::from_millis(500), - rx.recv() - ).await; - + let received = + tokio::time::timeout(tokio::time::Duration::from_millis(500), rx.recv()).await; + 
assert!(received.is_ok(), "Should receive a message from channel"); if let Ok(Some(events_vec)) = received { // Verify the message content // Count total events let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - assert_eq!(total_events, batch_size, "Should receive exactly batch_size events"); - + assert_eq!( + total_events, batch_size, + "Should receive exactly batch_size events" + ); + // Verify event structure assert!(!events_vec.is_empty(), "Events vector should not be empty"); for event_batch in &events_vec { - assert!(!event_batch.is_empty(), "Each event batch should not be empty"); + assert!( + !event_batch.is_empty(), + "Each event batch should not be empty" + ); } } else { panic!("Failed to receive message from channel"); } - + // Wait for run to complete let _ = run_handle.await; } @@ -665,44 +834,43 @@ mod tests { async fn test_send_when_timeout_reached() { let batch_size = 100; // Large batch size so we don't reach it let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - + // Create events with timestamps that exceed timeout (180 seconds) let oldest_ts = 1000; let latest_ts = oldest_ts + 181; // Exceeds 180 second timeout - + // Create two events: one at the start, one after timeout - let events = vec![ - create_test_event(oldest_ts), - create_test_event(latest_ts), - ]; - + let events = vec![create_test_event(oldest_ts), create_test_event(latest_ts)]; + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Wait a bit for the message to be sent tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Verify that a message was sent through the channel due to timeout - let received = tokio::time::timeout( - tokio::time::Duration::from_millis(500), - rx.recv() - ).await; 
- - assert!(received.is_ok(), "Should receive a message from channel due to timeout"); + let received = + tokio::time::timeout(tokio::time::Duration::from_millis(500), rx.recv()).await; + + assert!( + received.is_ok(), + "Should receive a message from channel due to timeout" + ); if let Ok(Some(events_vec)) = received { // Verify the message content // Verify events were sent let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - assert_eq!(total_events, 2, "Should receive both events (oldest and latest)"); + assert_eq!( + total_events, 2, + "Should receive both events (oldest and latest)" + ); } else { panic!("Failed to receive message from channel"); } - + // Wait for run to complete let _ = run_handle.await; } @@ -711,55 +879,60 @@ mod tests { async fn test_channel_full_keep_cache_when_not_timeout() { let batch_size = 5; let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - + // Create many events to fill the channel (capacity 1) // The first batch will fill the channel, second batch should be kept in cache // and retried later let events: Vec = (0..batch_size * 2) .map(|i| create_test_event(1000 + i as i64)) // All within timeout window .collect(); - + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Don't consume from rx immediately to fill the channel // Wait a bit for the first message to be sent // The channel should be full now, and subsequent sends should keep data in cache // Since we're not consuming, the channel stays full // After a bit more time, the run should complete tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Now consume the first message let first_msg = rx.recv().await; assert!(first_msg.is_some(), "Should receive first message"); if let 
Some(events_vec) = first_msg { // Verify first message content let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - assert_eq!(total_events, batch_size, "First message should contain batch_size events"); + assert_eq!( + total_events, batch_size, + "First message should contain batch_size events" + ); } - + // Wait a bit more - the second batch should be sent after channel has space tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Check if second message was sent (data was kept in cache and retried) - let second_msg = tokio::time::timeout( - tokio::time::Duration::from_millis(200), - rx.recv() - ).await; - + let second_msg = + tokio::time::timeout(tokio::time::Duration::from_millis(200), rx.recv()).await; + // The second batch should eventually be sent (kept in cache and retried) - assert!(second_msg.is_ok(), "Should eventually receive second message after retry"); + assert!( + second_msg.is_ok(), + "Should eventually receive second message after retry" + ); if let Ok(Some(events_vec)) = second_msg { // Verify second message content let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - assert_eq!(total_events, batch_size, "Second message should contain batch_size events"); + assert_eq!( + total_events, batch_size, + "Second message should contain batch_size events" + ); } - + // Wait for run to complete let _ = run_handle.await; } @@ -768,7 +941,7 @@ mod tests { async fn test_channel_full_drop_when_timeout() { let batch_size = 5; let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - + // Create events with timeout: first batch, then events after timeout let mut events = vec![]; // First batch at timestamp 1000 @@ -780,54 +953,59 @@ mod tests { events.push(create_test_event(1005 + i as i64)); } events.push(create_test_event(1186)); - + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move 
{ - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Don't consume from rx to fill the channel // Wait for first message to be sent // Channel should be full now // When the timeout event arrives and channel is full, data should be dropped tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Consume the first message let first_msg = rx.recv().await; assert!(first_msg.is_some(), "Should receive first message"); if let Some(events_vec) = first_msg { // Verify first message content let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - assert_eq!(total_events, batch_size, "First message should contain batch_size events"); - + assert_eq!( + total_events, batch_size, + "First message should contain batch_size events" + ); + // Verify timestamps are from the first batch (1000-1004) for event_batch in &events_vec { for event in event_batch { if let Event::Log(ref log_event) = event { - if let Some(timestamp) = log_event.get("timestamps").and_then(|v| v.as_integer()) { - assert!(timestamp >= 1000 && timestamp < 1000 + batch_size as i64, - "First message should contain events from first batch"); + if let Some(timestamp) = + log_event.get("timestamps").and_then(|v| v.as_integer()) + { + assert!( + timestamp >= 1000 && timestamp < 1000 + batch_size as i64, + "First message should contain events from first batch" + ); } } } } } - + // Wait a bit more - the timeout event should have been dropped, not sent tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Check if a second message was sent (it shouldn't be, as data was dropped) - let second_msg = tokio::time::timeout( - tokio::time::Duration::from_millis(200), - rx.recv() - ).await; + let second_msg = + tokio::time::timeout(tokio::time::Duration::from_millis(200), rx.recv()).await; // The second message should NOT be sent because data was dropped due to timeout - assert!(second_msg.is_err() 
|| second_msg.unwrap().is_none(), - "Should NOT receive second message as data was dropped due to timeout"); - + assert!( + second_msg.is_err() || second_msg.unwrap().is_none(), + "Should NOT receive second message as data was dropped due to timeout" + ); + // Wait for run to complete let _ = run_handle.await; } @@ -836,41 +1014,38 @@ mod tests { async fn test_not_send_when_batch_size_and_timeout_not_reached() { let batch_size = 10; let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - + // Create events that don't reach batch size and don't timeout - let events: Vec = (0..3) - .map(|i| create_test_event(1000 + i)) - .collect(); - + let events: Vec = (0..3).map(|i| create_test_event(1000 + i)).collect(); + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Wait for run to complete let result = run_handle.await; assert!(result.is_ok()); assert!(result.unwrap().is_ok()); - + // Verify that no message was sent (data doesn't meet send conditions) // Note: When stream ends, remaining data might be flushed, but with only 3 events // and batch_size 10, and no timeout, it should not send immediately // However, when the stream ends, the loop exits and remaining cache might be sent // Let's check if any message was received - let received = tokio::time::timeout( - tokio::time::Duration::from_millis(200), - rx.recv() - ).await; - + let received = + tokio::time::timeout(tokio::time::Duration::from_millis(200), rx.recv()).await; + // With the current implementation, when stream ends, remaining cache might be sent // So we check if a message was received and verify its content if let Ok(Some(events_vec)) = received { // Verify the message content let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - 
assert_eq!(total_events, 3, "Should receive the 3 events that were cached"); + assert_eq!( + total_events, 3, + "Should receive the 3 events that were cached" + ); } else { // If no message was received, that's also valid - data wasn't sent // This depends on implementation details of when remaining cache is flushed @@ -881,41 +1056,42 @@ mod tests { async fn test_batch_size_sending_behavior() { let batch_size = 3; let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - + // Create exactly batch_size events let events: Vec = (0..batch_size) .map(|i| create_test_event(1000 + i as i64)) .collect(); - + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Wait a bit for the message to be sent tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Verify that a message was sent through the channel - let received = tokio::time::timeout( - tokio::time::Duration::from_millis(500), - rx.recv() - ).await; - + let received = + tokio::time::timeout(tokio::time::Duration::from_millis(500), rx.recv()).await; + assert!(received.is_ok(), "Should receive a message from channel"); if let Ok(Some(events_vec)) = received { // Verify the message content // Count total events let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - assert_eq!(total_events, batch_size, "Should receive exactly batch_size events"); - + assert_eq!( + total_events, batch_size, + "Should receive exactly batch_size events" + ); + // Verify event timestamps for event_batch in events_vec { for (i, event) in event_batch.iter().enumerate() { if let Event::Log(ref log_event) = event { - if let Some(timestamp) = log_event.get("timestamps").and_then(|v| v.as_integer()) { + if let Some(timestamp) = + 
log_event.get("timestamps").and_then(|v| v.as_integer()) + { assert_eq!(timestamp, 1000 + i as i64, "Event timestamp should match"); } } @@ -924,7 +1100,7 @@ mod tests { } else { panic!("Failed to receive message from channel"); } - + // Wait for run to complete let _ = run_handle.await; } @@ -933,56 +1109,58 @@ mod tests { async fn test_timeout_sending_behavior() { let batch_size = 100; // Large batch size let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - + // Create events with large time gap (exceeding 180 seconds) let oldest_ts = 1000; let latest_ts = 1181; // 181 seconds later, exceeds timeout - let events = vec![ - create_test_event(oldest_ts), - create_test_event(latest_ts), - ]; - + let events = vec![create_test_event(oldest_ts), create_test_event(latest_ts)]; + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Wait a bit for the message to be sent tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Verify that a message was sent through the channel due to timeout - let received = tokio::time::timeout( - tokio::time::Duration::from_millis(500), - rx.recv() - ).await; - - assert!(received.is_ok(), "Should receive a message from channel due to timeout"); + let received = + tokio::time::timeout(tokio::time::Duration::from_millis(500), rx.recv()).await; + + assert!( + received.is_ok(), + "Should receive a message from channel due to timeout" + ); if let Ok(Some(events_vec)) = received { // Verify the message content // Count total events let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); assert_eq!(total_events, 2, "Should receive both events"); - + // Verify event timestamps let mut timestamps = Vec::new(); for event_batch in &events_vec { for event in 
event_batch { if let Event::Log(ref log_event) = event { - if let Some(timestamp) = log_event.get("timestamps").and_then(|v| v.as_integer()) { + if let Some(timestamp) = + log_event.get("timestamps").and_then(|v| v.as_integer()) + { timestamps.push(timestamp); } } } } timestamps.sort(); - assert_eq!(timestamps, vec![oldest_ts, latest_ts], "Should receive events with correct timestamps"); + assert_eq!( + timestamps, + vec![oldest_ts, latest_ts], + "Should receive events with correct timestamps" + ); } else { panic!("Failed to receive message from channel"); } - + // Wait for run to complete let _ = run_handle.await; } @@ -991,51 +1169,51 @@ mod tests { async fn test_multiple_batches() { let batch_size = 3; let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - + // Create multiple batches worth of events let total_events = batch_size * 3; let events: Vec = (0..total_events) .map(|i| create_test_event(1000 + i as i64)) .collect(); - + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Collect all messages from the channel let mut received_messages = Vec::new(); let expected_batches = (total_events + batch_size - 1) / batch_size; // Ceiling division - + // Wait for all batches to be sent for _ in 0..expected_batches { - let received = tokio::time::timeout( - tokio::time::Duration::from_millis(500), - rx.recv() - ).await; + let received = + tokio::time::timeout(tokio::time::Duration::from_millis(500), rx.recv()).await; if let Ok(Some(msg)) = received { received_messages.push(msg); } else { break; } } - + // Verify we received the expected number of batches assert!(received_messages.len() >= 1); // Verify total events received - let total_received: usize = received_messages.iter() + let total_received: usize = 
received_messages + .iter() .map(|events_vec| events_vec.iter().map(|v| v.len()).sum::()) .sum(); - assert_eq!(total_received, total_events, "Should receive all events across batches"); - + assert_eq!( + total_received, total_events, + "Should receive all events across batches" + ); + // Verify each message for events_vec in &received_messages { assert!(!events_vec.is_empty(), "Each batch should contain events"); } - + // Wait for run to complete let _ = run_handle.await; } diff --git a/src/sources/keyviz.rs b/src/sources/keyviz.rs index b182041..6395951 100644 --- a/src/sources/keyviz.rs +++ b/src/sources/keyviz.rs @@ -92,6 +92,8 @@ impl SourceConfig for KeyvizConfig { // Since we already checked is_nextgen_mode() above, we know we're in legacy mode here let topo = TopologyFetcher::new( Some(pd_address.clone()), + None, // manager_server_address + None, // tidb_namespace tls.clone(), &cx.proxy, None, // tidb_group diff --git a/src/sources/system_tables/controller.rs b/src/sources/system_tables/controller.rs index 3ddc962..4293bd5 100644 --- a/src/sources/system_tables/controller.rs +++ b/src/sources/system_tables/controller.rs @@ -70,6 +70,8 @@ impl Controller { TopologyFetcher::new( Some(String::new()), None, + None, + None, proxy_config, tidb_group.clone(), label_k8s_instance.clone(), @@ -94,6 +96,8 @@ impl Controller { TopologyFetcher::new( Some(pd_addr), + None, + None, pd_tls.clone(), proxy_config, tidb_group.clone(), diff --git a/src/sources/topsql/controller.rs b/src/sources/topsql/controller.rs index d0f3cbd..f371b9c 100644 --- a/src/sources/topsql/controller.rs +++ b/src/sources/topsql/controller.rs @@ -47,6 +47,8 @@ impl Controller { pub async fn new( sharedpool_id: Option, pd_address: Option, + manager_server_address: Option, + tidb_namespace: Option, topo_fetch_interval: Duration, init_retry_delay: Duration, top_n: usize, @@ -61,6 +63,8 @@ impl Controller { ) -> vector::Result { let topo_fetcher = TopologyFetcher::new( pd_address, + 
manager_server_address, + tidb_namespace, tls_config.clone(), proxy_config, tidb_group, diff --git a/src/sources/topsql/mod.rs b/src/sources/topsql/mod.rs index 8b8e57d..72b837b 100644 --- a/src/sources/topsql/mod.rs +++ b/src/sources/topsql/mod.rs @@ -37,6 +37,12 @@ pub struct TopSQLConfig { /// PLACEHOLDER pub pd_address: Option, + /// PLACEHOLDER + pub manager_server_address: Option, + + /// PLACEHOLDER + pub tidb_namespace: Option, + /// PLACEHOLDER pub tls: Option, @@ -81,6 +87,8 @@ impl GenerateConfig for TopSQLConfig { label_k8s_instance: None, keyspace_to_vmtenants: None, pd_address: None, + manager_server_address: None, + tidb_namespace: None, tls: None, init_retry_delay_seconds: default_init_retry_delay(), topology_fetch_interval_seconds: default_topology_fetch_interval(), @@ -102,6 +110,8 @@ impl SourceConfig for TopSQLConfig { let label_k8s_instance = self.label_k8s_instance.clone(); let keyspace_to_vmtenants = self.keyspace_to_vmtenants.clone(); let pd_address = self.pd_address.clone(); + let manager_server_address = self.manager_server_address.clone(); + let tidb_namespace = self.tidb_namespace.clone(); let tls = self.tls.clone(); let topology_fetch_interval = Duration::from_secs_f64(self.topology_fetch_interval_seconds); let init_retry_delay = Duration::from_secs_f64(self.init_retry_delay_seconds); @@ -127,6 +137,8 @@ impl SourceConfig for TopSQLConfig { let controller = Controller::new( sharedpool_id, pd_address, + manager_server_address, + tidb_namespace, topology_fetch_interval, init_retry_delay, top_n, diff --git a/src/sources/topsql_v2/arch.md b/src/sources/topsql_v2/arch.md index 3ffd579..8cd9777 100644 --- a/src/sources/topsql_v2/arch.md +++ b/src/sources/topsql_v2/arch.md @@ -33,6 +33,7 @@ TopSQL v2 Source 2. **Improved Error Recovery**: More robust error handling and recovery 3. **Better Performance**: Optimized data collection and processing 4. **Next-gen Features**: Support for new TiDB/TiKV features +5. 
**Manager-based TiDB Discovery**: In legacy mode, active TiDB instances can be discovered from a manager service via `manager_server_address` and `tidb_namespace` ## Configuration @@ -45,6 +46,12 @@ pub struct TopSQLV2Config { } ``` +Legacy mode discovery options: + +- `pd_address`: used for PD/store discovery and schema management +- `manager_server_address`: optional manager endpoint used to fetch active TiDB instances +- `tidb_namespace`: manager namespace list used when calling `/api/tidb/get_active_tidb` + ## Data Flow Same as TopSQL v1 but with improved reliability and performance. diff --git a/src/sources/topsql_v2/controller.rs b/src/sources/topsql_v2/controller.rs index 7351626..e519939 100644 --- a/src/sources/topsql_v2/controller.rs +++ b/src/sources/topsql_v2/controller.rs @@ -45,6 +45,8 @@ struct ActiveSchemaManager { impl Controller { pub async fn new( pd_address: Option, + manager_server_address: Option, + tidb_namespace: Option, topo_fetch_interval: Duration, init_retry_delay: Duration, top_n: usize, @@ -59,6 +61,8 @@ impl Controller { ) -> vector::Result { let topo_fetcher = TopologyFetcher::new( pd_address, + manager_server_address, + tidb_namespace, tls_config.clone(), proxy_config, tidb_group, diff --git a/src/sources/topsql_v2/mod.rs b/src/sources/topsql_v2/mod.rs index faee452..fbac2d0 100644 --- a/src/sources/topsql_v2/mod.rs +++ b/src/sources/topsql_v2/mod.rs @@ -68,6 +68,12 @@ pub struct TopSQLConfig { /// PLACEHOLDER pub pd_address: Option, + /// PLACEHOLDER + pub manager_server_address: Option, + + /// PLACEHOLDER + pub tidb_namespace: Option, + /// PLACEHOLDER pub tls: Option, @@ -114,6 +120,8 @@ impl GenerateConfig for TopSQLConfig { tidb_group: None, label_k8s_instance: None, pd_address: None, + manager_server_address: None, + tidb_namespace: None, tls: None, init_retry_delay_seconds: default_init_retry_delay(), topology_fetch_interval_seconds: default_topology_fetch_interval(), @@ -134,6 +142,8 @@ impl SourceConfig for TopSQLConfig { 
let tidb_group = self.tidb_group.clone(); let label_k8s_instance = self.label_k8s_instance.clone(); let pd_address = self.pd_address.clone(); + let manager_server_address = self.manager_server_address.clone(); + let tidb_namespace = self.tidb_namespace.clone(); let tls = self.tls.clone(); let topology_fetch_interval = Duration::from_secs_f64(self.topology_fetch_interval_seconds); let init_retry_delay = Duration::from_secs_f64(self.init_retry_delay_seconds); @@ -145,6 +155,8 @@ impl SourceConfig for TopSQLConfig { Ok(Box::pin(async move { let controller = Controller::new( pd_address, + manager_server_address, + tidb_namespace, topology_fetch_interval, init_retry_delay, top_n, From 6dbf0fa2c59cf46d69cefa58284b103ff52fee81 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Sun, 22 Mar 2026 22:14:59 +0800 Subject: [PATCH 07/26] topsql: infer topru keyspace from user --- src/sources/topsql_v2/upstream/tidb/parser.rs | 413 ++++++++++++------ 1 file changed, 286 insertions(+), 127 deletions(-) diff --git a/src/sources/topsql_v2/upstream/tidb/parser.rs b/src/sources/topsql_v2/upstream/tidb/parser.rs index 50d1aba..d8e6ef1 100644 --- a/src/sources/topsql_v2/upstream/tidb/parser.rs +++ b/src/sources/topsql_v2/upstream/tidb/parser.rs @@ -1,24 +1,25 @@ use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; -use chrono::Utc; -use vector::event::Event; -use vector_lib::event::{LogEvent, Value as LogValue}; use crate::sources::topsql_v2::schema_cache::SchemaCache; use crate::sources::topsql_v2::upstream::consts::{ - LABEL_DATE, LABEL_ENCODED_NORMALIZED_PLAN, LABEL_INSTANCE_KEY, - LABEL_NORMALIZED_PLAN, LABEL_NORMALIZED_SQL, LABEL_PLAN_DIGEST, - LABEL_SQL_DIGEST, LABEL_SOURCE_TABLE, LABEL_TIMESTAMPS, LABEL_KEYSPACE, LABEL_USER, - METRIC_NAME_CPU_TIME_MS, METRIC_NAME_NETWORK_IN_BYTES, METRIC_NAME_NETWORK_OUT_BYTES, - METRIC_NAME_STMT_DURATION_COUNT, METRIC_NAME_STMT_DURATION_SUM_NS, METRIC_NAME_STMT_EXEC_COUNT, - METRIC_NAME_TOTAL_RU, METRIC_NAME_EXEC_COUNT, 
METRIC_NAME_EXEC_DURATION, - SOURCE_TABLE_TIDB_TOPSQL, SOURCE_TABLE_TOPSQL_PLAN_META, SOURCE_TABLE_TOPSQL_SQL_META, SOURCE_TABLE_TOPRU, + LABEL_DATE, LABEL_ENCODED_NORMALIZED_PLAN, LABEL_INSTANCE_KEY, LABEL_KEYSPACE, + LABEL_NORMALIZED_PLAN, LABEL_NORMALIZED_SQL, LABEL_PLAN_DIGEST, LABEL_SOURCE_TABLE, + LABEL_SQL_DIGEST, LABEL_TIMESTAMPS, LABEL_USER, METRIC_NAME_CPU_TIME_MS, + METRIC_NAME_EXEC_COUNT, METRIC_NAME_EXEC_DURATION, METRIC_NAME_NETWORK_IN_BYTES, + METRIC_NAME_NETWORK_OUT_BYTES, METRIC_NAME_STMT_DURATION_COUNT, + METRIC_NAME_STMT_DURATION_SUM_NS, METRIC_NAME_STMT_EXEC_COUNT, METRIC_NAME_TOTAL_RU, + SOURCE_TABLE_TIDB_TOPSQL, SOURCE_TABLE_TOPRU, SOURCE_TABLE_TOPSQL_PLAN_META, + SOURCE_TABLE_TOPSQL_SQL_META, }; use crate::sources::topsql_v2::upstream::parser::UpstreamEventParser; use crate::sources::topsql_v2::upstream::tidb::proto::top_sql_sub_response::RespOneof; use crate::sources::topsql_v2::upstream::tidb::proto::{ PlanMeta, SqlMeta, TopSqlRecord, TopSqlRecordItem, TopSqlSubResponse, }; +use chrono::Utc; +use vector::event::Event; +use vector_lib::event::{LogEvent, Value as LogValue}; pub struct TopSqlSubResponseParser; @@ -31,9 +32,7 @@ impl UpstreamEventParser for TopSqlSubResponseParser { _schema_cache: Arc, ) -> Vec { match response.resp_oneof { - Some(RespOneof::Record(record)) => { - Self::parse_tidb_record(record, instance) - } + Some(RespOneof::Record(record)) => Self::parse_tidb_record(record, instance), Some(RespOneof::SqlMeta(sql_meta)) => Self::parse_tidb_sql_meta(sql_meta), Some(RespOneof::PlanMeta(plan_meta)) => Self::parse_tidb_plan_meta(plan_meta), Some(RespOneof::RuRecord(ru_record)) => Self::parse_top_ru_record(ru_record), @@ -103,14 +102,15 @@ impl UpstreamEventParser for TopSqlSubResponseParser { let mut cpu_values: Vec = v.iter().map(|psd| psd.cpu_time_ms).collect(); cpu_values.select_nth_unstable_by(top_n, |a, b| b.cmp(a)); let cpu_threshold = cpu_values[top_n]; - + // Find top_n threshold for network bytes using partial selection 
- let mut network_values: Vec = v.iter() + let mut network_values: Vec = v + .iter() .map(|psd| psd.stmt_network_in_bytes + psd.stmt_network_out_bytes) .collect(); network_values.select_nth_unstable_by(top_n, |a, b| b.cmp(a)); let network_threshold = network_values[top_n]; - + // Keep records that meet either threshold let mut kept = Vec::new(); for psd in v.iter() { @@ -132,7 +132,7 @@ impl UpstreamEventParser for TopSqlSubResponseParser { others.stmt_network_out_bytes += psd.stmt_network_out_bytes; } } - + *v = kept; } @@ -215,10 +215,7 @@ impl UpstreamEventParser for TopSqlSubResponseParser { } impl TopSqlSubResponseParser { - fn parse_tidb_record( - record: TopSqlRecord, - instance: String, - ) -> Vec { + fn parse_tidb_record(record: TopSqlRecord, instance: String) -> Vec { let mut keyspace_name_str = "".to_string(); if !record.keyspace_name.is_empty() { if let Ok(ks) = String::from_utf8(record.keyspace_name.clone()) { @@ -237,8 +234,8 @@ impl TopSqlSubResponseParser { log.insert(LABEL_TIMESTAMPS, LogValue::from(item.timestamp_sec)); if date.is_empty() { date = chrono::DateTime::from_timestamp(item.timestamp_sec as i64, 0) - .map(|dt| dt.format("%Y-%m-%d").to_string()) - .unwrap_or_else(|| "1970-01-01".to_string()); + .map(|dt| dt.format("%Y-%m-%d").to_string()) + .unwrap_or_else(|| "1970-01-01".to_string()); } log.insert(LABEL_DATE, LogValue::from(date.clone())); log.insert(LABEL_INSTANCE_KEY, instance_key.clone()); @@ -299,8 +296,7 @@ impl TopSqlSubResponseParser { fn parse_tidb_plan_meta(plan_meta: PlanMeta) -> Vec { let mut events = vec![]; let plan_digest = hex::encode_upper(plan_meta.plan_digest); - let encoded_normalized_plan = - hex::encode_upper(plan_meta.encoded_normalized_plan); + let encoded_normalized_plan = hex::encode_upper(plan_meta.encoded_normalized_plan); let mut event = Event::Log(LogEvent::default()); let log = event.as_mut_log(); @@ -308,10 +304,7 @@ impl TopSqlSubResponseParser { log.insert(LABEL_SOURCE_TABLE, 
SOURCE_TABLE_TOPSQL_PLAN_META); log.insert(LABEL_PLAN_DIGEST, plan_digest); log.insert(LABEL_NORMALIZED_PLAN, plan_meta.normalized_plan); - log.insert( - LABEL_ENCODED_NORMALIZED_PLAN, - encoded_normalized_plan, - ); + log.insert(LABEL_ENCODED_NORMALIZED_PLAN, encoded_normalized_plan); let now = Utc::now(); log.insert(LABEL_TIMESTAMPS, LogValue::from(now.timestamp())); let date_str = now.format("%Y-%m-%d").to_string(); @@ -320,16 +313,13 @@ impl TopSqlSubResponseParser { events } - fn parse_top_ru_record(record: crate::sources::topsql_v2::upstream::tidb::proto::TopRuRecord) -> Vec { + fn parse_top_ru_record( + record: crate::sources::topsql_v2::upstream::tidb::proto::TopRuRecord, + ) -> Vec { let mut events = vec![]; let mut date = String::new(); - let mut keyspace_name_str = "".to_string(); - if !record.keyspace_name.is_empty() { - if let Ok(ks) = String::from_utf8(record.keyspace_name.clone()) { - keyspace_name_str = ks; - } - } + let keyspace_name_str = Self::extract_top_ru_keyspace(&record); for item in record.items { let mut event = Event::Log(LogEvent::default()); @@ -359,18 +349,44 @@ impl TopSqlSubResponseParser { ); log.insert(METRIC_NAME_TOTAL_RU, LogValue::from(item.total_ru)); log.insert(METRIC_NAME_EXEC_COUNT, LogValue::from(item.exec_count)); - log.insert(METRIC_NAME_EXEC_DURATION, LogValue::from(item.exec_duration)); + log.insert( + METRIC_NAME_EXEC_DURATION, + LogValue::from(item.exec_duration), + ); events.push(event.into_log()); } events } + + fn extract_top_ru_keyspace( + record: &crate::sources::topsql_v2::upstream::tidb::proto::TopRuRecord, + ) -> String { + if !record.keyspace_name.is_empty() { + if let Ok(ks) = String::from_utf8(record.keyspace_name.clone()) { + return ks; + } + } + + record + .user + .split_once('@') + .map(|(user_with_keyspace, _)| user_with_keyspace) + .unwrap_or(record.user.as_str()) + .split_once('.') + .map(|(keyspace, _)| keyspace) + .filter(|keyspace| !keyspace.is_empty()) + .unwrap_or_default() + .to_string() + } } 
#[cfg(test)] mod tests { use super::*; - use crate::sources::topsql_v2::upstream::tidb::proto::{TopSqlRecordItem, TopRuRecord, TopRuRecordItem}; + use crate::sources::topsql_v2::upstream::tidb::proto::{ + TopRuRecord, TopRuRecordItem, TopSqlRecordItem, + }; const MOCK_RECORDS: &'static str = include_str!("testdata/mock-records.json"); @@ -431,7 +447,7 @@ mod tests { let plan_digest = vec![4, 5, 6]; let timestamp = 1000u64; let test_keyspace_name = b"test_keyspace_2".to_vec(); - + // Create 5 records with same timestamp let items: Vec = (0..5) .map(|i| TopSqlRecordItem { @@ -445,7 +461,7 @@ mod tests { stmt_network_out_bytes: 200 + i as u64, }) .collect(); - + responses.push(TopSqlSubResponse { resp_oneof: Some(RespOneof::Record(TopSqlRecord { sql_digest: sql_digest.clone(), @@ -454,21 +470,24 @@ mod tests { keyspace_name: test_keyspace_name.clone(), })), }); - + // top_n = 10, which is greater than 5, so all should be kept let result = TopSqlSubResponseParser::keep_top_n(responses.clone(), 10); - + // Should have same number of responses (all kept) assert_eq!(result.len(), 1); if let Some(RespOneof::Record(record)) = &result[0].resp_oneof { assert_eq!(record.items.len(), 5); assert_eq!(record.sql_digest, sql_digest); assert_eq!(record.plan_digest, plan_digest); - assert_eq!(record.keyspace_name, test_keyspace_name, "keyspace_name should be preserved"); + assert_eq!( + record.keyspace_name, test_keyspace_name, + "keyspace_name should be preserved" + ); } else { panic!("Expected Record"); } - + // top_n = 5, which equals 5, so all should be kept let result2 = TopSqlSubResponseParser::keep_top_n(responses, 5); assert_eq!(result2.len(), 1); @@ -476,7 +495,10 @@ mod tests { assert_eq!(record.items.len(), 5); assert_eq!(record.sql_digest, sql_digest); assert_eq!(record.plan_digest, plan_digest); - assert_eq!(record.keyspace_name, test_keyspace_name, "keyspace_name should be preserved"); + assert_eq!( + record.keyspace_name, test_keyspace_name, + "keyspace_name should be 
preserved" + ); } else { panic!("Expected Record"); } @@ -491,7 +513,7 @@ mod tests { let plan_digest = vec![4, 5, 6]; let timestamp = 1000u64; let test_keyspace_name = b"test_keyspace_3".to_vec(); - + // Create 10 records with same cpu_time_ms and same network bytes let items: Vec = (0..10) .map(|_| TopSqlRecordItem { @@ -501,11 +523,11 @@ mod tests { stmt_kv_exec_count: BTreeMap::new(), stmt_duration_sum_ns: 1000, stmt_duration_count: 1, - stmt_network_in_bytes: 100, // All same + stmt_network_in_bytes: 100, // All same stmt_network_out_bytes: 200, // All same, total = 300 }) .collect(); - + responses.push(TopSqlSubResponse { resp_oneof: Some(RespOneof::Record(TopSqlRecord { sql_digest: sql_digest.clone(), @@ -514,44 +536,45 @@ mod tests { keyspace_name: test_keyspace_name.clone(), })), }); - + // top_n = 5, all values are same // New logic: threshold equals the value (top_n-th largest, which is the same value), // so no records satisfy > threshold condition, all should go to others let result = TopSqlSubResponseParser::keep_top_n(responses, 5); - + // Verify all records go to others let mut total_cpu_kept = 0u32; let mut total_network_kept = 0u64; let mut kept_count = 0; let mut total_cpu_others = 0u32; let mut total_network_others = 0u64; - + for response in result { if let Some(RespOneof::Record(record)) = response.resp_oneof { // Verify keyspace_name is preserved assert_eq!( - record.keyspace_name, - test_keyspace_name, + record.keyspace_name, test_keyspace_name, "keyspace_name should be preserved in all records" ); - + if record.sql_digest.is_empty() { // This is others for item in record.items { total_cpu_others += item.cpu_time_ms; - total_network_others += item.stmt_network_in_bytes + item.stmt_network_out_bytes; + total_network_others += + item.stmt_network_in_bytes + item.stmt_network_out_bytes; } } else { kept_count += record.items.len(); for item in record.items { total_cpu_kept += item.cpu_time_ms; - total_network_kept += item.stmt_network_in_bytes + 
item.stmt_network_out_bytes; + total_network_kept += + item.stmt_network_in_bytes + item.stmt_network_out_bytes; } } } } - + // New behavior: all records go to others (none satisfy > threshold when all values are same) assert_eq!(kept_count, 0); assert_eq!(total_cpu_kept, 0); @@ -568,7 +591,7 @@ mod tests { let mut responses = vec![]; let top_n = 3; let test_keyspace_name = b"test_keyspace_timestamps".to_vec(); - + // Timestamp 1000: 8 records mixing high CPU/low network, low CPU/high network, both high, both low // Expected: Keep records that meet either CPU threshold (>20) OR network threshold (>40) // Top 3 CPU: 100, 90, 80 -> threshold = 20 (4th largest) @@ -576,16 +599,16 @@ mod tests { let timestamp1 = 1000u64; let test_cases_ts1 = vec![ // (sql_id, plan_id, cpu_time_ms, network_in_bytes, network_out_bytes, reason) - (1, 1, 100, 10, 10), // High CPU (100), low network (20) -> keep (CPU > 20) - (2, 2, 90, 10, 10), // High CPU (90), low network (20) -> keep (CPU > 20) - (3, 3, 80, 10, 10), // High CPU (80), low network (20) -> keep (CPU > 20) + (1, 1, 100, 10, 10), // High CPU (100), low network (20) -> keep (CPU > 20) + (2, 2, 90, 10, 10), // High CPU (90), low network (20) -> keep (CPU > 20) + (3, 3, 80, 10, 10), // High CPU (80), low network (20) -> keep (CPU > 20) (4, 4, 10, 200, 200), // Low CPU (10), high network (400) -> keep (network > 40) (5, 5, 10, 175, 175), // Low CPU (10), high network (350) -> keep (network > 40) (6, 6, 10, 150, 150), // Low CPU (10), high network (300) -> keep (network > 40) - (7, 7, 20, 20, 20), // Low CPU (20), low network (40) -> evict (CPU == 20, network == 40) - (8, 8, 15, 15, 15), // Low CPU (15), low network (30) -> evict + (7, 7, 20, 20, 20), // Low CPU (20), low network (40) -> evict (CPU == 20, network == 40) + (8, 8, 15, 15, 15), // Low CPU (15), low network (30) -> evict ]; - + for (sql_id, plan_id, cpu_time, net_in, net_out) in test_cases_ts1.iter() { let sql_digest = vec![*sql_id]; let plan_digest = vec![*plan_id]; 
@@ -607,22 +630,22 @@ mod tests { })), }); } - + // Timestamp 2000: 7 records mixing different combinations // Expected: Keep records that meet either CPU threshold (>20) OR network threshold (>60) // Top 3 CPU: 100, 90, 70 -> threshold = 20 (4th largest) // Top 3 Network: 380, 360, 140 -> threshold = 60 (4th largest) let timestamp2 = 2000u64; let test_cases_ts2 = vec![ - (9, 9, 100, 10, 10), // High CPU (100), low network (20) -> keep (CPU > 20) - (10, 10, 90, 10, 10), // High CPU (90), low network (20) -> keep (CPU > 20) - (11, 11, 70, 10, 10), // High CPU (70), low network (20) -> keep (CPU > 20) + (9, 9, 100, 10, 10), // High CPU (100), low network (20) -> keep (CPU > 20) + (10, 10, 90, 10, 10), // High CPU (90), low network (20) -> keep (CPU > 20) + (11, 11, 70, 10, 10), // High CPU (70), low network (20) -> keep (CPU > 20) (12, 12, 10, 190, 190), // Low CPU (10), high network (380) -> keep (network > 60) (13, 13, 10, 180, 180), // Low CPU (10), high network (360) -> keep (network > 60) (14, 14, 10, 70, 70), // Low CPU (10), high network (140) -> keep (network > 60) - (15, 15, 20, 30, 30), // Low CPU (20), low network (60) -> evict (CPU == 20, network == 60) + (15, 15, 20, 30, 30), // Low CPU (20), low network (60) -> evict (CPU == 20, network == 60) ]; - + for (sql_id, plan_id, cpu_time, net_in, net_out) in test_cases_ts2.iter() { let sql_digest = vec![*sql_id]; let plan_digest = vec![*plan_id]; @@ -644,14 +667,11 @@ mod tests { })), }); } - + // Timestamp 3000: 2 records (both should be kept since 2 <= top_n=3) let timestamp3 = 3000u64; - let test_cases_ts3 = vec![ - (16, 16, 50, 50, 50), - (17, 17, 40, 40, 40), - ]; - + let test_cases_ts3 = vec![(16, 16, 50, 50, 50), (17, 17, 40, 40, 40)]; + for (sql_id, plan_id, cpu_time, net_in, net_out) in test_cases_ts3.iter() { let sql_digest = vec![*sql_id]; let plan_digest = vec![*plan_id]; @@ -673,26 +693,25 @@ mod tests { })), }); } - + let result = TopSqlSubResponseParser::keep_top_n(responses, top_n); - + // 
Group results by timestamp let mut results_by_timestamp: BTreeMap> = BTreeMap::new(); // timestamp -> [(sql_id, cpu, network), ...] let mut others_by_timestamp: BTreeMap = BTreeMap::new(); // timestamp -> (cpu, network) - + for response in result { if let Some(RespOneof::Record(record)) = response.resp_oneof { // Verify keyspace_name is preserved assert_eq!( - record.keyspace_name, - test_keyspace_name, + record.keyspace_name, test_keyspace_name, "keyspace_name should be preserved in all records" ); - + for item in record.items { let timestamp = item.timestamp_sec; let network_total = item.stmt_network_in_bytes + item.stmt_network_out_bytes; - + if record.sql_digest.is_empty() { // This is others let entry = others_by_timestamp.entry(timestamp).or_insert((0, 0)); @@ -709,7 +728,7 @@ mod tests { } } } - + // Verify timestamp 1000: should keep 6 records (3 high CPU + 3 high network), evict 2 // CPU threshold = 20 (4th largest), keep records with CPU > 20 // Network threshold = 40 (4th largest), keep records with network > 40 @@ -717,19 +736,47 @@ mod tests { .get(&timestamp1) .map(|records| records.iter().map(|r| r.0).collect()) .unwrap_or_default(); - assert_eq!(ts1_kept.len(), 6, "Timestamp 1000 should keep 6 records (3 high CPU + 3 high network)"); + assert_eq!( + ts1_kept.len(), + 6, + "Timestamp 1000 should keep 6 records (3 high CPU + 3 high network)" + ); // High CPU records (1, 2, 3) should be kept - assert!(ts1_kept.contains(&1), "Timestamp 1000 should keep sql_id 1 (high CPU)"); - assert!(ts1_kept.contains(&2), "Timestamp 1000 should keep sql_id 2 (high CPU)"); - assert!(ts1_kept.contains(&3), "Timestamp 1000 should keep sql_id 3 (high CPU)"); + assert!( + ts1_kept.contains(&1), + "Timestamp 1000 should keep sql_id 1 (high CPU)" + ); + assert!( + ts1_kept.contains(&2), + "Timestamp 1000 should keep sql_id 2 (high CPU)" + ); + assert!( + ts1_kept.contains(&3), + "Timestamp 1000 should keep sql_id 3 (high CPU)" + ); // High network records (4, 5, 6) should be kept
- assert!(ts1_kept.contains(&4), "Timestamp 1000 should keep sql_id 4 (high network)"); - assert!(ts1_kept.contains(&5), "Timestamp 1000 should keep sql_id 5 (high network)"); - assert!(ts1_kept.contains(&6), "Timestamp 1000 should keep sql_id 6 (high network)"); + assert!( + ts1_kept.contains(&4), + "Timestamp 1000 should keep sql_id 4 (high network)" + ); + assert!( + ts1_kept.contains(&5), + "Timestamp 1000 should keep sql_id 5 (high network)" + ); + assert!( + ts1_kept.contains(&6), + "Timestamp 1000 should keep sql_id 6 (high network)" + ); // Low both records (7, 8) should be evicted - assert!(!ts1_kept.contains(&7), "Timestamp 1000 should NOT keep sql_id 7 (low both)"); - assert!(!ts1_kept.contains(&8), "Timestamp 1000 should NOT keep sql_id 8 (low both)"); - + assert!( + !ts1_kept.contains(&7), + "Timestamp 1000 should NOT keep sql_id 7 (low both)" + ); + assert!( + !ts1_kept.contains(&8), + "Timestamp 1000 should NOT keep sql_id 8 (low both)" + ); + // Verify kept records meet at least one threshold if let Some(records) = results_by_timestamp.get(&timestamp1) { let cpu_threshold = 20u32; @@ -744,14 +791,22 @@ mod tests { ); } } - + if let Some((others_cpu, others_network)) = others_by_timestamp.get(&timestamp1) { - assert_eq!(*others_cpu, 20 + 15, "Timestamp 1000 others CPU should be 35 (20+15)"); - assert_eq!(*others_network, 40 + 30, "Timestamp 1000 others network should be 70 (40+30)"); + assert_eq!( + *others_cpu, + 20 + 15, + "Timestamp 1000 others CPU should be 35 (20+15)" + ); + assert_eq!( + *others_network, + 40 + 30, + "Timestamp 1000 others network should be 70 (40+30)" + ); } else { panic!("Timestamp 1000 should have others records"); } - + // Verify timestamp 2000: should keep 6 records (3 high CPU + 3 high network), evict 1 // CPU threshold = 20 (4th largest), keep records with CPU > 20 // Network threshold = 60 (4th largest), keep records with network > 60 @@ -759,18 +814,43 @@ mod tests { .get(&timestamp2) .map(|records| records.iter().map(|r|
r.0).collect()) .unwrap_or_default(); - assert_eq!(ts2_kept.len(), 6, "Timestamp 2000 should keep 6 records (3 high CPU + 3 high network)"); + assert_eq!( + ts2_kept.len(), + 6, + "Timestamp 2000 should keep 6 records (3 high CPU + 3 high network)" + ); // High CPU records (9, 10, 11) should be kept - assert!(ts2_kept.contains(&9), "Timestamp 2000 should keep sql_id 9 (high CPU)"); - assert!(ts2_kept.contains(&10), "Timestamp 2000 should keep sql_id 10 (high CPU)"); - assert!(ts2_kept.contains(&11), "Timestamp 2000 should keep sql_id 11 (high CPU)"); + assert!( + ts2_kept.contains(&9), + "Timestamp 2000 should keep sql_id 9 (high CPU)" + ); + assert!( + ts2_kept.contains(&10), + "Timestamp 2000 should keep sql_id 10 (high CPU)" + ); + assert!( + ts2_kept.contains(&11), + "Timestamp 2000 should keep sql_id 11 (high CPU)" + ); // High network records (12, 13, 14) should be kept - assert!(ts2_kept.contains(&12), "Timestamp 2000 should keep sql_id 12 (high network)"); - assert!(ts2_kept.contains(&13), "Timestamp 2000 should keep sql_id 13 (high network)"); - assert!(ts2_kept.contains(&14), "Timestamp 2000 should keep sql_id 14 (high network)"); + assert!( + ts2_kept.contains(&12), + "Timestamp 2000 should keep sql_id 12 (high network)" + ); + assert!( + ts2_kept.contains(&13), + "Timestamp 2000 should keep sql_id 13 (high network)" + ); + assert!( + ts2_kept.contains(&14), + "Timestamp 2000 should keep sql_id 14 (high network)" + ); // Low both record (15) should be evicted - assert!(!ts2_kept.contains(&15), "Timestamp 2000 should NOT keep sql_id 15 (low both)"); - + assert!( + !ts2_kept.contains(&15), + "Timestamp 2000 should NOT keep sql_id 15 (low both)" + ); + // Verify kept records meet at least one threshold if let Some(records) = results_by_timestamp.get(&timestamp2) { let cpu_threshold = 20u32; @@ -785,28 +865,47 @@ mod tests { ); } } - + if let Some((others_cpu, others_network)) = others_by_timestamp.get(&timestamp2) { assert_eq!(*others_cpu, 20, "Timestamp 2000 others
CPU should be 20"); - assert_eq!(*others_network, 60, "Timestamp 2000 others network should be 60 (30+30)"); + assert_eq!( + *others_network, 60, + "Timestamp 2000 others network should be 60 (30+30)" + ); } else { panic!("Timestamp 2000 should have others records"); } - + // Verify timestamp 3000: should keep all 2 records (2 <= top_n=3) let ts3_kept: Vec = results_by_timestamp .get(&timestamp3) .map(|records| records.iter().map(|r| r.0).collect()) .unwrap_or_default(); - assert_eq!(ts3_kept.len(), 2, "Timestamp 3000 should keep all 2 records"); - assert!(ts3_kept.contains(&16), "Timestamp 3000 should keep sql_id 16"); - assert!(ts3_kept.contains(&17), "Timestamp 3000 should keep sql_id 17"); - + assert_eq!( + ts3_kept.len(), + 2, + "Timestamp 3000 should keep all 2 records" + ); + assert!( + ts3_kept.contains(&16), + "Timestamp 3000 should keep sql_id 16" + ); + assert!( + ts3_kept.contains(&17), + "Timestamp 3000 should keep sql_id 17" + ); + // Timestamp 3000 should not have others since all records are kept - assert!(!others_by_timestamp.contains_key(&timestamp3), "Timestamp 3000 should not have others"); - + assert!( + !others_by_timestamp.contains_key(&timestamp3), + "Timestamp 3000 should not have others" + ); + // Verify total counts - let total_kept: usize = results_by_timestamp.values().map(|records| records.len()).sum(); + let total_kept: usize = results_by_timestamp + .values() + .map(|records| records.len()) + .sum(); assert_eq!(total_kept, 14, "Total kept records should be 14 (6+6+2)"); } @@ -877,7 +976,10 @@ mod tests { assert_eq!(sum_old.stmt_duration_count, sum_new.stmt_duration_count); assert_eq!(sum_old.stmt_duration_sum_ns, sum_new.stmt_duration_sum_ns); assert_eq!(sum_old.stmt_network_in_bytes, sum_new.stmt_network_in_bytes); - assert_eq!(sum_old.stmt_network_out_bytes, sum_new.stmt_network_out_bytes); + assert_eq!( + sum_old.stmt_network_out_bytes, + sum_new.stmt_network_out_bytes + ); } #[test] @@ -909,25 +1011,82 @@ mod tests { // Check first event let
event1 = &events[0]; let log1 = event1; - assert_eq!(log1.get(LABEL_SOURCE_TABLE), Some(&LogValue::from(SOURCE_TABLE_TOPRU))); - assert_eq!(log1.get(LABEL_TIMESTAMPS), Some(&LogValue::from(1709646900))); + assert_eq!( + log1.get(LABEL_SOURCE_TABLE), + Some(&LogValue::from(SOURCE_TABLE_TOPRU)) + ); + assert_eq!( + log1.get(LABEL_TIMESTAMPS), + Some(&LogValue::from(1709646900)) + ); assert_eq!(log1.get(LABEL_DATE), Some(&LogValue::from("2024-03-05"))); - assert_eq!(log1.get(LABEL_KEYSPACE), Some(&LogValue::from("test_keyspace"))); + assert_eq!( + log1.get(LABEL_KEYSPACE), + Some(&LogValue::from("test_keyspace")) + ); assert_eq!(log1.get(LABEL_USER), Some(&LogValue::from("test_user"))); - assert_eq!(log1.get(LABEL_SQL_DIGEST), Some(&LogValue::from("73716C5F6469676573745F313233"))); - assert_eq!(log1.get(LABEL_PLAN_DIGEST), Some(&LogValue::from("706C616E5F6469676573745F343536"))); + assert_eq!( + log1.get(LABEL_SQL_DIGEST), + Some(&LogValue::from("73716C5F6469676573745F313233")) + ); + assert_eq!( + log1.get(LABEL_PLAN_DIGEST), + Some(&LogValue::from("706C616E5F6469676573745F343536")) + ); assert_eq!(log1.get(METRIC_NAME_TOTAL_RU), Some(&LogValue::from(100.5))); assert_eq!(log1.get(METRIC_NAME_EXEC_COUNT), Some(&LogValue::from(10))); - assert_eq!(log1.get(METRIC_NAME_EXEC_DURATION), Some(&LogValue::from(50000000))); + assert_eq!( + log1.get(METRIC_NAME_EXEC_DURATION), + Some(&LogValue::from(50000000)) + ); // Check second event let event2 = &events[1]; let log2 = event2; - assert_eq!(log2.get(LABEL_SOURCE_TABLE), Some(&LogValue::from(SOURCE_TABLE_TOPRU))); - assert_eq!(log2.get(LABEL_TIMESTAMPS), Some(&LogValue::from(1709646960))); + assert_eq!( + log2.get(LABEL_SOURCE_TABLE), + Some(&LogValue::from(SOURCE_TABLE_TOPRU)) + ); + assert_eq!( + log2.get(LABEL_TIMESTAMPS), + Some(&LogValue::from(1709646960)) + ); assert_eq!(log2.get(LABEL_DATE), Some(&LogValue::from("2024-03-05"))); assert_eq!(log2.get(METRIC_NAME_TOTAL_RU), Some(&LogValue::from(200.0))); 
assert_eq!(log2.get(METRIC_NAME_EXEC_COUNT), Some(&LogValue::from(20))); - assert_eq!(log2.get(METRIC_NAME_EXEC_DURATION), Some(&LogValue::from(100000000))); + assert_eq!( + log2.get(METRIC_NAME_EXEC_DURATION), + Some(&LogValue::from(100000000)) + ); + } + + #[test] + fn test_parse_top_ru_record_falls_back_to_user_prefix_for_keyspace() { + let ru_record = TopRuRecord { + keyspace_name: vec![], + user: "test_keyspace.root@%".to_string(), + sql_digest: b"sql_digest_123".to_vec(), + plan_digest: b"plan_digest_456".to_vec(), + items: vec![TopRuRecordItem { + timestamp_sec: 1709646900, + total_ru: 54.4, + exec_count: 5, + exec_duration: 123456789, + }], + }; + + let events = TopSqlSubResponseParser::parse_top_ru_record(ru_record); + assert_eq!(events.len(), 1); + + let log = &events[0]; + assert_eq!( + log.get(LABEL_KEYSPACE), + Some(&LogValue::from("test_keyspace")) + ); + assert_eq!( + log.get(LABEL_USER), + Some(&LogValue::from("test_keyspace.root@%")) + ); + assert_eq!(log.get(METRIC_NAME_TOTAL_RU), Some(&LogValue::from(54.4))); } } From b7cdb9c4b609b9807dd67eaa7cdc2d64b7d73cd8 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Sun, 22 Mar 2026 22:30:21 +0800 Subject: [PATCH 08/26] Revert "topsql: infer topru keyspace from user" This reverts commit 6dbf0fa2c59cf46d69cefa58284b103ff52fee81. 
--- src/sources/topsql_v2/upstream/tidb/parser.rs | 413 ++++++------------ 1 file changed, 127 insertions(+), 286 deletions(-) diff --git a/src/sources/topsql_v2/upstream/tidb/parser.rs b/src/sources/topsql_v2/upstream/tidb/parser.rs index d8e6ef1..50d1aba 100644 --- a/src/sources/topsql_v2/upstream/tidb/parser.rs +++ b/src/sources/topsql_v2/upstream/tidb/parser.rs @@ -1,25 +1,24 @@ use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; +use chrono::Utc; +use vector::event::Event; +use vector_lib::event::{LogEvent, Value as LogValue}; use crate::sources::topsql_v2::schema_cache::SchemaCache; use crate::sources::topsql_v2::upstream::consts::{ - LABEL_DATE, LABEL_ENCODED_NORMALIZED_PLAN, LABEL_INSTANCE_KEY, LABEL_KEYSPACE, - LABEL_NORMALIZED_PLAN, LABEL_NORMALIZED_SQL, LABEL_PLAN_DIGEST, LABEL_SOURCE_TABLE, - LABEL_SQL_DIGEST, LABEL_TIMESTAMPS, LABEL_USER, METRIC_NAME_CPU_TIME_MS, - METRIC_NAME_EXEC_COUNT, METRIC_NAME_EXEC_DURATION, METRIC_NAME_NETWORK_IN_BYTES, - METRIC_NAME_NETWORK_OUT_BYTES, METRIC_NAME_STMT_DURATION_COUNT, - METRIC_NAME_STMT_DURATION_SUM_NS, METRIC_NAME_STMT_EXEC_COUNT, METRIC_NAME_TOTAL_RU, - SOURCE_TABLE_TIDB_TOPSQL, SOURCE_TABLE_TOPRU, SOURCE_TABLE_TOPSQL_PLAN_META, - SOURCE_TABLE_TOPSQL_SQL_META, + LABEL_DATE, LABEL_ENCODED_NORMALIZED_PLAN, LABEL_INSTANCE_KEY, + LABEL_NORMALIZED_PLAN, LABEL_NORMALIZED_SQL, LABEL_PLAN_DIGEST, + LABEL_SQL_DIGEST, LABEL_SOURCE_TABLE, LABEL_TIMESTAMPS, LABEL_KEYSPACE, LABEL_USER, + METRIC_NAME_CPU_TIME_MS, METRIC_NAME_NETWORK_IN_BYTES, METRIC_NAME_NETWORK_OUT_BYTES, + METRIC_NAME_STMT_DURATION_COUNT, METRIC_NAME_STMT_DURATION_SUM_NS, METRIC_NAME_STMT_EXEC_COUNT, + METRIC_NAME_TOTAL_RU, METRIC_NAME_EXEC_COUNT, METRIC_NAME_EXEC_DURATION, + SOURCE_TABLE_TIDB_TOPSQL, SOURCE_TABLE_TOPSQL_PLAN_META, SOURCE_TABLE_TOPSQL_SQL_META, SOURCE_TABLE_TOPRU, }; use crate::sources::topsql_v2::upstream::parser::UpstreamEventParser; use crate::sources::topsql_v2::upstream::tidb::proto::top_sql_sub_response::RespOneof; use 
crate::sources::topsql_v2::upstream::tidb::proto::{ PlanMeta, SqlMeta, TopSqlRecord, TopSqlRecordItem, TopSqlSubResponse, }; -use chrono::Utc; -use vector::event::Event; -use vector_lib::event::{LogEvent, Value as LogValue}; pub struct TopSqlSubResponseParser; @@ -32,7 +31,9 @@ impl UpstreamEventParser for TopSqlSubResponseParser { _schema_cache: Arc, ) -> Vec { match response.resp_oneof { - Some(RespOneof::Record(record)) => Self::parse_tidb_record(record, instance), + Some(RespOneof::Record(record)) => { + Self::parse_tidb_record(record, instance) + } Some(RespOneof::SqlMeta(sql_meta)) => Self::parse_tidb_sql_meta(sql_meta), Some(RespOneof::PlanMeta(plan_meta)) => Self::parse_tidb_plan_meta(plan_meta), Some(RespOneof::RuRecord(ru_record)) => Self::parse_top_ru_record(ru_record), @@ -102,15 +103,14 @@ impl UpstreamEventParser for TopSqlSubResponseParser { let mut cpu_values: Vec = v.iter().map(|psd| psd.cpu_time_ms).collect(); cpu_values.select_nth_unstable_by(top_n, |a, b| b.cmp(a)); let cpu_threshold = cpu_values[top_n]; - + // Find top_n threshold for network bytes using partial selection - let mut network_values: Vec = v - .iter() + let mut network_values: Vec = v.iter() .map(|psd| psd.stmt_network_in_bytes + psd.stmt_network_out_bytes) .collect(); network_values.select_nth_unstable_by(top_n, |a, b| b.cmp(a)); let network_threshold = network_values[top_n]; - + // Keep records that meet either threshold let mut kept = Vec::new(); for psd in v.iter() { @@ -132,7 +132,7 @@ impl UpstreamEventParser for TopSqlSubResponseParser { others.stmt_network_out_bytes += psd.stmt_network_out_bytes; } } - + *v = kept; } @@ -215,7 +215,10 @@ impl UpstreamEventParser for TopSqlSubResponseParser { } impl TopSqlSubResponseParser { - fn parse_tidb_record(record: TopSqlRecord, instance: String) -> Vec { + fn parse_tidb_record( + record: TopSqlRecord, + instance: String, + ) -> Vec { let mut keyspace_name_str = "".to_string(); if !record.keyspace_name.is_empty() { if let Ok(ks) = 
String::from_utf8(record.keyspace_name.clone()) { @@ -234,8 +237,8 @@ impl TopSqlSubResponseParser { log.insert(LABEL_TIMESTAMPS, LogValue::from(item.timestamp_sec)); if date.is_empty() { date = chrono::DateTime::from_timestamp(item.timestamp_sec as i64, 0) - .map(|dt| dt.format("%Y-%m-%d").to_string()) - .unwrap_or_else(|| "1970-01-01".to_string()); + .map(|dt| dt.format("%Y-%m-%d").to_string()) + .unwrap_or_else(|| "1970-01-01".to_string()); } log.insert(LABEL_DATE, LogValue::from(date.clone())); log.insert(LABEL_INSTANCE_KEY, instance_key.clone()); @@ -296,7 +299,8 @@ impl TopSqlSubResponseParser { fn parse_tidb_plan_meta(plan_meta: PlanMeta) -> Vec { let mut events = vec![]; let plan_digest = hex::encode_upper(plan_meta.plan_digest); - let encoded_normalized_plan = hex::encode_upper(plan_meta.encoded_normalized_plan); + let encoded_normalized_plan = + hex::encode_upper(plan_meta.encoded_normalized_plan); let mut event = Event::Log(LogEvent::default()); let log = event.as_mut_log(); @@ -304,7 +308,10 @@ impl TopSqlSubResponseParser { log.insert(LABEL_SOURCE_TABLE, SOURCE_TABLE_TOPSQL_PLAN_META); log.insert(LABEL_PLAN_DIGEST, plan_digest); log.insert(LABEL_NORMALIZED_PLAN, plan_meta.normalized_plan); - log.insert(LABEL_ENCODED_NORMALIZED_PLAN, encoded_normalized_plan); + log.insert( + LABEL_ENCODED_NORMALIZED_PLAN, + encoded_normalized_plan, + ); let now = Utc::now(); log.insert(LABEL_TIMESTAMPS, LogValue::from(now.timestamp())); let date_str = now.format("%Y-%m-%d").to_string(); @@ -313,13 +320,16 @@ impl TopSqlSubResponseParser { events } - fn parse_top_ru_record( - record: crate::sources::topsql_v2::upstream::tidb::proto::TopRuRecord, - ) -> Vec { + fn parse_top_ru_record(record: crate::sources::topsql_v2::upstream::tidb::proto::TopRuRecord) -> Vec { let mut events = vec![]; let mut date = String::new(); - let keyspace_name_str = Self::extract_top_ru_keyspace(&record); + let mut keyspace_name_str = "".to_string(); + if !record.keyspace_name.is_empty() { + if 
let Ok(ks) = String::from_utf8(record.keyspace_name.clone()) { + keyspace_name_str = ks; + } + } for item in record.items { let mut event = Event::Log(LogEvent::default()); @@ -349,44 +359,18 @@ impl TopSqlSubResponseParser { ); log.insert(METRIC_NAME_TOTAL_RU, LogValue::from(item.total_ru)); log.insert(METRIC_NAME_EXEC_COUNT, LogValue::from(item.exec_count)); - log.insert( - METRIC_NAME_EXEC_DURATION, - LogValue::from(item.exec_duration), - ); + log.insert(METRIC_NAME_EXEC_DURATION, LogValue::from(item.exec_duration)); events.push(event.into_log()); } events } - - fn extract_top_ru_keyspace( - record: &crate::sources::topsql_v2::upstream::tidb::proto::TopRuRecord, - ) -> String { - if !record.keyspace_name.is_empty() { - if let Ok(ks) = String::from_utf8(record.keyspace_name.clone()) { - return ks; - } - } - - record - .user - .split_once('@') - .map(|(user_with_keyspace, _)| user_with_keyspace) - .unwrap_or(record.user.as_str()) - .split_once('.') - .map(|(keyspace, _)| keyspace) - .filter(|keyspace| !keyspace.is_empty()) - .unwrap_or_default() - .to_string() - } } #[cfg(test)] mod tests { use super::*; - use crate::sources::topsql_v2::upstream::tidb::proto::{ - TopRuRecord, TopRuRecordItem, TopSqlRecordItem, - }; + use crate::sources::topsql_v2::upstream::tidb::proto::{TopSqlRecordItem, TopRuRecord, TopRuRecordItem}; const MOCK_RECORDS: &'static str = include_str!("testdata/mock-records.json"); @@ -447,7 +431,7 @@ mod tests { let plan_digest = vec![4, 5, 6]; let timestamp = 1000u64; let test_keyspace_name = b"test_keyspace_2".to_vec(); - + // Create 5 records with same timestamp let items: Vec = (0..5) .map(|i| TopSqlRecordItem { @@ -461,7 +445,7 @@ mod tests { stmt_network_out_bytes: 200 + i as u64, }) .collect(); - + responses.push(TopSqlSubResponse { resp_oneof: Some(RespOneof::Record(TopSqlRecord { sql_digest: sql_digest.clone(), @@ -470,24 +454,21 @@ mod tests { keyspace_name: test_keyspace_name.clone(), })), }); - + // top_n = 10, which is greater than 5, 
so all should be kept let result = TopSqlSubResponseParser::keep_top_n(responses.clone(), 10); - + // Should have same number of responses (all kept) assert_eq!(result.len(), 1); if let Some(RespOneof::Record(record)) = &result[0].resp_oneof { assert_eq!(record.items.len(), 5); assert_eq!(record.sql_digest, sql_digest); assert_eq!(record.plan_digest, plan_digest); - assert_eq!( - record.keyspace_name, test_keyspace_name, - "keyspace_name should be preserved" - ); + assert_eq!(record.keyspace_name, test_keyspace_name, "keyspace_name should be preserved"); } else { panic!("Expected Record"); } - + // top_n = 5, which equals 5, so all should be kept let result2 = TopSqlSubResponseParser::keep_top_n(responses, 5); assert_eq!(result2.len(), 1); @@ -495,10 +476,7 @@ mod tests { assert_eq!(record.items.len(), 5); assert_eq!(record.sql_digest, sql_digest); assert_eq!(record.plan_digest, plan_digest); - assert_eq!( - record.keyspace_name, test_keyspace_name, - "keyspace_name should be preserved" - ); + assert_eq!(record.keyspace_name, test_keyspace_name, "keyspace_name should be preserved"); } else { panic!("Expected Record"); } @@ -513,7 +491,7 @@ mod tests { let plan_digest = vec![4, 5, 6]; let timestamp = 1000u64; let test_keyspace_name = b"test_keyspace_3".to_vec(); - + // Create 10 records with same cpu_time_ms and same network bytes let items: Vec = (0..10) .map(|_| TopSqlRecordItem { @@ -523,11 +501,11 @@ mod tests { stmt_kv_exec_count: BTreeMap::new(), stmt_duration_sum_ns: 1000, stmt_duration_count: 1, - stmt_network_in_bytes: 100, // All same + stmt_network_in_bytes: 100, // All same stmt_network_out_bytes: 200, // All same, total = 300 }) .collect(); - + responses.push(TopSqlSubResponse { resp_oneof: Some(RespOneof::Record(TopSqlRecord { sql_digest: sql_digest.clone(), @@ -536,45 +514,44 @@ mod tests { keyspace_name: test_keyspace_name.clone(), })), }); - + // top_n = 5, all values are same // New logic: threshold equals the value (top_n-th largest, which is the 
same value), // so no records satisfy > threshold condition, all should go to others let result = TopSqlSubResponseParser::keep_top_n(responses, 5); - + // Verify all records go to others let mut total_cpu_kept = 0u32; let mut total_network_kept = 0u64; let mut kept_count = 0; let mut total_cpu_others = 0u32; let mut total_network_others = 0u64; - + for response in result { if let Some(RespOneof::Record(record)) = response.resp_oneof { // Verify keyspace_name is preserved assert_eq!( - record.keyspace_name, test_keyspace_name, + record.keyspace_name, + test_keyspace_name, "keyspace_name should be preserved in all records" ); - + if record.sql_digest.is_empty() { // This is others for item in record.items { total_cpu_others += item.cpu_time_ms; - total_network_others += - item.stmt_network_in_bytes + item.stmt_network_out_bytes; + total_network_others += item.stmt_network_in_bytes + item.stmt_network_out_bytes; } } else { kept_count += record.items.len(); for item in record.items { total_cpu_kept += item.cpu_time_ms; - total_network_kept += - item.stmt_network_in_bytes + item.stmt_network_out_bytes; + total_network_kept += item.stmt_network_in_bytes + item.stmt_network_out_bytes; } } } } - + // New behavior: all records go to others (none satisfy > threshold when all values are same) assert_eq!(kept_count, 0); assert_eq!(total_cpu_kept, 0); @@ -591,7 +568,7 @@ mod tests { let mut responses = vec![]; let top_n = 3; let test_keyspace_name = b"test_keyspace_timestamps".to_vec(); - + // Timestamp 1000: 8 records mixing high CPU/low network, low CPU/high network, both high, both low // Expected: Keep records that meet either CPU threshold (>20) OR network threshold (>40) // Top 3 CPU: 100, 90, 80 -> threshold = 20 (4th largest) @@ -599,16 +576,16 @@ mod tests { let timestamp1 = 1000u64; let test_cases_ts1 = vec![ // (sql_id, plan_id, cpu_time_ms, network_in_bytes, network_out_bytes, reason) - (1, 1, 100, 10, 10), // High CPU (100), low network (20) -> keep (CPU > 20) - 
(2, 2, 90, 10, 10), // High CPU (90), low network (20) -> keep (CPU > 20) - (3, 3, 80, 10, 10), // High CPU (80), low network (20) -> keep (CPU > 20) + (1, 1, 100, 10, 10), // High CPU (100), low network (20) -> keep (CPU > 20) + (2, 2, 90, 10, 10), // High CPU (90), low network (20) -> keep (CPU > 20) + (3, 3, 80, 10, 10), // High CPU (80), low network (20) -> keep (CPU > 20) (4, 4, 10, 200, 200), // Low CPU (10), high network (400) -> keep (network > 40) (5, 5, 10, 175, 175), // Low CPU (10), high network (350) -> keep (network > 40) (6, 6, 10, 150, 150), // Low CPU (10), high network (300) -> keep (network > 40) - (7, 7, 20, 20, 20), // Low CPU (20), low network (40) -> evict (CPU == 20, network == 40) - (8, 8, 15, 15, 15), // Low CPU (15), low network (30) -> evict + (7, 7, 20, 20, 20), // Low CPU (20), low network (40) -> evict (CPU == 20, network == 40) + (8, 8, 15, 15, 15), // Low CPU (15), low network (30) -> evict ]; - + for (sql_id, plan_id, cpu_time, net_in, net_out) in test_cases_ts1.iter() { let sql_digest = vec![*sql_id]; let plan_digest = vec![*plan_id]; @@ -630,22 +607,22 @@ mod tests { })), }); } - + // Timestamp 2000: 7 records mixing different combinations // Expected: Keep records that meet either CPU threshold (>20) OR network threshold (>60) // Top 3 CPU: 100, 90, 70 -> threshold = 20 (4th largest) // Top 3 Network: 380, 360, 140 -> threshold = 60 (4th largest) let timestamp2 = 2000u64; let test_cases_ts2 = vec![ - (9, 9, 100, 10, 10), // High CPU (100), low network (20) -> keep (CPU > 20) - (10, 10, 90, 10, 10), // High CPU (90), low network (20) -> keep (CPU > 20) - (11, 11, 70, 10, 10), // High CPU (70), low network (20) -> keep (CPU > 20) + (9, 9, 100, 10, 10), // High CPU (100), low network (20) -> keep (CPU > 20) + (10, 10, 90, 10, 10), // High CPU (90), low network (20) -> keep (CPU > 20) + (11, 11, 70, 10, 10), // High CPU (70), low network (20) -> keep (CPU > 20) (12, 12, 10, 190, 190), // Low CPU (10), high network (380) -> keep 
(network > 60) (13, 13, 10, 180, 180), // Low CPU (10), high network (360) -> keep (network > 60) (14, 14, 10, 70, 70), // Low CPU (10), high network (140) -> keep (network > 60) - (15, 15, 20, 30, 30), // Low CPU (20), low network (60) -> evict (CPU == 20, network == 60) + (15, 15, 20, 30, 30), // Low CPU (20), low network (60) -> evict (CPU == 20, network == 60) ]; - + for (sql_id, plan_id, cpu_time, net_in, net_out) in test_cases_ts2.iter() { let sql_digest = vec![*sql_id]; let plan_digest = vec![*plan_id]; @@ -667,11 +644,14 @@ mod tests { })), }); } - + // Timestamp 3000: 2 records (both should be kept since 2 <= top_n=3) let timestamp3 = 3000u64; - let test_cases_ts3 = vec![(16, 16, 50, 50, 50), (17, 17, 40, 40, 40)]; - + let test_cases_ts3 = vec![ + (16, 16, 50, 50, 50), + (17, 17, 40, 40, 40), + ]; + for (sql_id, plan_id, cpu_time, net_in, net_out) in test_cases_ts3.iter() { let sql_digest = vec![*sql_id]; let plan_digest = vec![*plan_id]; @@ -693,25 +673,26 @@ mod tests { })), }); } - + let result = TopSqlSubResponseParser::keep_top_n(responses, top_n); - + // Group results by timestamp let mut results_by_timestamp: BTreeMap> = BTreeMap::new(); // timestamp -> [(sql_id, cpu, network), ...] 
let mut others_by_timestamp: BTreeMap = BTreeMap::new(); // timestamp -> (cpu, network) - + for response in result { if let Some(RespOneof::Record(record)) = response.resp_oneof { // Verify keyspace_name is preserved assert_eq!( - record.keyspace_name, test_keyspace_name, + record.keyspace_name, + test_keyspace_name, "keyspace_name should be preserved in all records" ); - + for item in record.items { let timestamp = item.timestamp_sec; let network_total = item.stmt_network_in_bytes + item.stmt_network_out_bytes; - + if record.sql_digest.is_empty() { // This is others let entry = others_by_timestamp.entry(timestamp).or_insert((0, 0)); @@ -728,7 +709,7 @@ mod tests { } } } - + // Verify timestamp 1000: should keep 6 records (3 high CPU + 3 high network), evict 2 // CPU threshold = 20 (4th largest), keep records with CPU > 20 // Network threshold = 40 (4th largest), keep records with network > 40 @@ -736,47 +717,19 @@ mod tests { .get(×tamp1) .map(|records| records.iter().map(|r| r.0).collect()) .unwrap_or_default(); - assert_eq!( - ts1_kept.len(), - 6, - "Timestamp 1000 should keep 6 records (3 high CPU + 3 high network)" - ); + assert_eq!(ts1_kept.len(), 6, "Timestamp 1000 should keep 6 records (3 high CPU + 3 high network)"); // High CPU records (1, 2, 3) should be kept - assert!( - ts1_kept.contains(&1), - "Timestamp 1000 should keep sql_id 1 (high CPU)" - ); - assert!( - ts1_kept.contains(&2), - "Timestamp 1000 should keep sql_id 2 (high CPU)" - ); - assert!( - ts1_kept.contains(&3), - "Timestamp 1000 should keep sql_id 3 (high CPU)" - ); + assert!(ts1_kept.contains(&1), "Timestamp 1000 should keep sql_id 1 (high CPU)"); + assert!(ts1_kept.contains(&2), "Timestamp 1000 should keep sql_id 2 (high CPU)"); + assert!(ts1_kept.contains(&3), "Timestamp 1000 should keep sql_id 3 (high CPU)"); // High network records (4, 5, 6) should be kept - assert!( - ts1_kept.contains(&4), - "Timestamp 1000 should keep sql_id 4 (high network)" - ); - assert!( - ts1_kept.contains(&5), 
- "Timestamp 1000 should keep sql_id 5 (high network)" - ); - assert!( - ts1_kept.contains(&6), - "Timestamp 1000 should keep sql_id 6 (high network)" - ); + assert!(ts1_kept.contains(&4), "Timestamp 1000 should keep sql_id 4 (high network)"); + assert!(ts1_kept.contains(&5), "Timestamp 1000 should keep sql_id 5 (high network)"); + assert!(ts1_kept.contains(&6), "Timestamp 1000 should keep sql_id 6 (high network)"); // Low both records (7, 8) should be evicted - assert!( - !ts1_kept.contains(&7), - "Timestamp 1000 should NOT keep sql_id 7 (low both)" - ); - assert!( - !ts1_kept.contains(&8), - "Timestamp 1000 should NOT keep sql_id 8 (low both)" - ); - + assert!(!ts1_kept.contains(&7), "Timestamp 1000 should NOT keep sql_id 7 (low both)"); + assert!(!ts1_kept.contains(&8), "Timestamp 1000 should NOT keep sql_id 8 (low both)"); + // Verify kept records meet at least one threshold if let Some(records) = results_by_timestamp.get(×tamp1) { let cpu_threshold = 20u32; @@ -791,22 +744,14 @@ mod tests { ); } } - + if let Some((others_cpu, others_network)) = others_by_timestamp.get(×tamp1) { - assert_eq!( - *others_cpu, - 20 + 15, - "Timestamp 1000 others CPU should be 35 (20+15)" - ); - assert_eq!( - *others_network, - 40 + 30, - "Timestamp 1000 others network should be 70 (40+30)" - ); + assert_eq!(*others_cpu, 20 + 15, "Timestamp 1000 others CPU should be 35 (20+15)"); + assert_eq!(*others_network, 40 + 30, "Timestamp 1000 others network should be 70 (40+30)"); } else { panic!("Timestamp 1000 should have others records"); } - + // Verify timestamp 2000: should keep 6 records (3 high CPU + 3 high network), evict 1 // CPU threshold = 20 (4th largest), keep records with CPU > 20 // Network threshold = 60 (4th largest), keep records with network > 60 @@ -814,43 +759,18 @@ mod tests { .get(×tamp2) .map(|records| records.iter().map(|r| r.0).collect()) .unwrap_or_default(); - assert_eq!( - ts2_kept.len(), - 6, - "Timestamp 2000 should keep 6 records (3 high CPU + 3 high 
network)" - ); + assert_eq!(ts2_kept.len(), 6, "Timestamp 2000 should keep 6 records (3 high CPU + 3 high network)"); // High CPU records (9, 10, 11) should be kept - assert!( - ts2_kept.contains(&9), - "Timestamp 2000 should keep sql_id 9 (high CPU)" - ); - assert!( - ts2_kept.contains(&10), - "Timestamp 2000 should keep sql_id 10 (high CPU)" - ); - assert!( - ts2_kept.contains(&11), - "Timestamp 2000 should keep sql_id 11 (high CPU)" - ); + assert!(ts2_kept.contains(&9), "Timestamp 2000 should keep sql_id 9 (high CPU)"); + assert!(ts2_kept.contains(&10), "Timestamp 2000 should keep sql_id 10 (high CPU)"); + assert!(ts2_kept.contains(&11), "Timestamp 2000 should keep sql_id 11 (high CPU)"); // High network records (12, 13, 14) should be kept - assert!( - ts2_kept.contains(&12), - "Timestamp 2000 should keep sql_id 12 (high network)" - ); - assert!( - ts2_kept.contains(&13), - "Timestamp 2000 should keep sql_id 13 (high network)" - ); - assert!( - ts2_kept.contains(&14), - "Timestamp 2000 should keep sql_id 14 (high network)" - ); + assert!(ts2_kept.contains(&12), "Timestamp 2000 should keep sql_id 12 (high network)"); + assert!(ts2_kept.contains(&13), "Timestamp 2000 should keep sql_id 13 (high network)"); + assert!(ts2_kept.contains(&14), "Timestamp 2000 should keep sql_id 14 (high network)"); // Low both record (15) should be evicted - assert!( - !ts2_kept.contains(&15), - "Timestamp 2000 should NOT keep sql_id 15 (low both)" - ); - + assert!(!ts2_kept.contains(&15), "Timestamp 2000 should NOT keep sql_id 15 (low both)"); + // Verify kept records meet at least one threshold if let Some(records) = results_by_timestamp.get(×tamp2) { let cpu_threshold = 20u32; @@ -865,47 +785,28 @@ mod tests { ); } } - + if let Some((others_cpu, others_network)) = others_by_timestamp.get(×tamp2) { assert_eq!(*others_cpu, 20, "Timestamp 2000 others CPU should be 20"); - assert_eq!( - *others_network, 60, - "Timestamp 2000 others network should be 60 (30+30)" - ); + 
assert_eq!(*others_network, 60, "Timestamp 2000 others network should be 60 (30+30)"); } else { panic!("Timestamp 2000 should have others records"); } - + // Verify timestamp 3000: should keep all 2 records (2 <= top_n=3) let ts3_kept: Vec = results_by_timestamp .get(×tamp3) .map(|records| records.iter().map(|r| r.0).collect()) .unwrap_or_default(); - assert_eq!( - ts3_kept.len(), - 2, - "Timestamp 3000 should keep all 2 records" - ); - assert!( - ts3_kept.contains(&16), - "Timestamp 3000 should keep sql_id 16" - ); - assert!( - ts3_kept.contains(&17), - "Timestamp 3000 should keep sql_id 17" - ); - + assert_eq!(ts3_kept.len(), 2, "Timestamp 3000 should keep all 2 records"); + assert!(ts3_kept.contains(&16), "Timestamp 3000 should keep sql_id 16"); + assert!(ts3_kept.contains(&17), "Timestamp 3000 should keep sql_id 17"); + // Timestamp 3000 should not have others since all records are kept - assert!( - !others_by_timestamp.contains_key(×tamp3), - "Timestamp 3000 should not have others" - ); - + assert!(!others_by_timestamp.contains_key(×tamp3), "Timestamp 3000 should not have others"); + // Verify total counts - let total_kept: usize = results_by_timestamp - .values() - .map(|records| records.len()) - .sum(); + let total_kept: usize = results_by_timestamp.values().map(|records| records.len()).sum(); assert_eq!(total_kept, 14, "Total kept records should be 14 (6+6+2)"); } @@ -976,10 +877,7 @@ mod tests { assert_eq!(sum_old.stmt_duration_count, sum_new.stmt_duration_count); assert_eq!(sum_old.stmt_duration_sum_ns, sum_new.stmt_duration_sum_ns); assert_eq!(sum_old.stmt_network_in_bytes, sum_new.stmt_network_in_bytes); - assert_eq!( - sum_old.stmt_network_out_bytes, - sum_new.stmt_network_out_bytes - ); + assert_eq!(sum_old.stmt_network_out_bytes, sum_new.stmt_network_out_bytes); } #[test] @@ -1011,82 +909,25 @@ mod tests { // Check first event let event1 = &events[0]; let log1 = event1; - assert_eq!( - log1.get(LABEL_SOURCE_TABLE), - 
Some(&LogValue::from(SOURCE_TABLE_TOPRU)) - ); - assert_eq!( - log1.get(LABEL_TIMESTAMPS), - Some(&LogValue::from(1709646900)) - ); + assert_eq!(log1.get(LABEL_SOURCE_TABLE), Some(&LogValue::from(SOURCE_TABLE_TOPRU))); + assert_eq!(log1.get(LABEL_TIMESTAMPS), Some(&LogValue::from(1709646900))); assert_eq!(log1.get(LABEL_DATE), Some(&LogValue::from("2024-03-05"))); - assert_eq!( - log1.get(LABEL_KEYSPACE), - Some(&LogValue::from("test_keyspace")) - ); + assert_eq!(log1.get(LABEL_KEYSPACE), Some(&LogValue::from("test_keyspace"))); assert_eq!(log1.get(LABEL_USER), Some(&LogValue::from("test_user"))); - assert_eq!( - log1.get(LABEL_SQL_DIGEST), - Some(&LogValue::from("73716C5F6469676573745F313233")) - ); - assert_eq!( - log1.get(LABEL_PLAN_DIGEST), - Some(&LogValue::from("706C616E5F6469676573745F343536")) - ); + assert_eq!(log1.get(LABEL_SQL_DIGEST), Some(&LogValue::from("73716C5F6469676573745F313233"))); + assert_eq!(log1.get(LABEL_PLAN_DIGEST), Some(&LogValue::from("706C616E5F6469676573745F343536"))); assert_eq!(log1.get(METRIC_NAME_TOTAL_RU), Some(&LogValue::from(100.5))); assert_eq!(log1.get(METRIC_NAME_EXEC_COUNT), Some(&LogValue::from(10))); - assert_eq!( - log1.get(METRIC_NAME_EXEC_DURATION), - Some(&LogValue::from(50000000)) - ); + assert_eq!(log1.get(METRIC_NAME_EXEC_DURATION), Some(&LogValue::from(50000000))); // Check second event let event2 = &events[1]; let log2 = event2; - assert_eq!( - log2.get(LABEL_SOURCE_TABLE), - Some(&LogValue::from(SOURCE_TABLE_TOPRU)) - ); - assert_eq!( - log2.get(LABEL_TIMESTAMPS), - Some(&LogValue::from(1709646960)) - ); + assert_eq!(log2.get(LABEL_SOURCE_TABLE), Some(&LogValue::from(SOURCE_TABLE_TOPRU))); + assert_eq!(log2.get(LABEL_TIMESTAMPS), Some(&LogValue::from(1709646960))); assert_eq!(log2.get(LABEL_DATE), Some(&LogValue::from("2024-03-05"))); assert_eq!(log2.get(METRIC_NAME_TOTAL_RU), Some(&LogValue::from(200.0))); assert_eq!(log2.get(METRIC_NAME_EXEC_COUNT), Some(&LogValue::from(20))); - assert_eq!( - 
log2.get(METRIC_NAME_EXEC_DURATION), - Some(&LogValue::from(100000000)) - ); - } - - #[test] - fn test_parse_top_ru_record_falls_back_to_user_prefix_for_keyspace() { - let ru_record = TopRuRecord { - keyspace_name: vec![], - user: "test_keyspace.root@%".to_string(), - sql_digest: b"sql_digest_123".to_vec(), - plan_digest: b"plan_digest_456".to_vec(), - items: vec![TopRuRecordItem { - timestamp_sec: 1709646900, - total_ru: 54.4, - exec_count: 5, - exec_duration: 123456789, - }], - }; - - let events = TopSqlSubResponseParser::parse_top_ru_record(ru_record); - assert_eq!(events.len(), 1); - - let log = &events[0]; - assert_eq!( - log.get(LABEL_KEYSPACE), - Some(&LogValue::from("test_keyspace")) - ); - assert_eq!( - log.get(LABEL_USER), - Some(&LogValue::from("test_keyspace.root@%")) - ); - assert_eq!(log.get(METRIC_NAME_TOTAL_RU), Some(&LogValue::from(54.4))); + assert_eq!(log2.get(METRIC_NAME_EXEC_DURATION), Some(&LogValue::from(100000000))); } } From f56120cbbbdb13c058ffa3e0acfc33471d6d2ad0 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Mon, 23 Mar 2026 13:51:19 +0800 Subject: [PATCH 09/26] feat: shard manager tidb by keyspace name --- src/common/topology/arch.md | 10 + src/common/topology/fetch/tidb_manager.rs | 261 +++++++++++++++++++++- 2 files changed, 267 insertions(+), 4 deletions(-) diff --git a/src/common/topology/arch.md b/src/common/topology/arch.md index fd34373..79e55ef 100644 --- a/src/common/topology/arch.md +++ b/src/common/topology/arch.md @@ -53,6 +53,7 @@ Components (Sources/Sinks) - **TiKV**: Discovers TiKV store instances - **TiFlash**: Discovers TiFlash instances - **Store**: Discovers store information +- **Manager-based TiDB discovery**: In legacy mode, TiDB instances can be fetched from manager `/api/tidb/get_active_tidb` ### Next-Gen Support @@ -71,6 +72,15 @@ pub struct TopologyFetcher { } ``` +Legacy manager-based TiDB discovery also supports keyspace-aware sharding: + +- `manager_server_address`: fetch active TiDB instances from manager 
instead of etcd +- `tidb_namespace`: namespace list sent to `/api/tidb/get_active_tidb?namespace=...` +- `VECTOR_STS_REPLICA_COUNT`: total number of Vector StatefulSet replicas +- `VECTOR_STS_ID`: current Vector StatefulSet ordinal + +When both shard envs are set, the manager response must include `keyspace_name`. The topology fetcher hashes `keyspace_name`, applies modulo `VECTOR_STS_REPLICA_COUNT`, and only keeps entries whose shard matches `VECTOR_STS_ID`. Entries without `keyspace_name` are skipped in this mode. + ## Topology Data ### Component Information diff --git a/src/common/topology/fetch/tidb_manager.rs b/src/common/topology/fetch/tidb_manager.rs index 21a94b3..f1ae586 100644 --- a/src/common/topology/fetch/tidb_manager.rs +++ b/src/common/topology/fetch/tidb_manager.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::{collections::HashSet, env}; use serde_json::{Map, Value}; use snafu::{ResultExt, Snafu}; @@ -11,6 +11,10 @@ const GET_ACTIVE_TIDB_PATH: &str = "/api/tidb/get_active_tidb"; const DEFAULT_TIDB_PRIMARY_PORT: u16 = 4000; const DEFAULT_TIDB_STATUS_PORT: u16 = 10080; const MAX_RESPONSE_DEPTH: usize = 8; +const VECTOR_STS_REPLICA_COUNT_ENV: &str = "VECTOR_STS_REPLICA_COUNT"; +const VECTOR_STS_ID_ENV: &str = "VECTOR_STS_ID"; +const FNV1A_64_OFFSET_BASIS: u64 = 0xcbf29ce484222325; +const FNV1A_64_PRIME: u64 = 0x100000001b3; #[derive(Debug, Snafu)] pub enum FetchError { @@ -24,6 +28,8 @@ pub enum FetchError { ActiveTiDBJsonFromStr { source: serde_json::Error }, #[snafu(display("Invalid manager server response: {}", message))] InvalidManagerResponse { message: String }, + #[snafu(display("Invalid manager keyspace shard config: {}", message))] + InvalidShardConfig { message: String }, #[snafu(display("Failed to parse tidb host from manager response: {}", source))] ParseTiDBHost { source: utils::ParseError }, } @@ -34,6 +40,13 @@ struct ActiveTiDBAddress { port: Option, status_port: Option, hostname: Option, + keyspace_name: Option, +} + 
+#[derive(Debug, Clone, Copy, Eq, PartialEq)] +struct ManagerShardConfig { + replica_count: u64, + sts_id: u64, } pub struct TiDBManagerTopologyFetcher<'a> { @@ -59,7 +72,11 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { &self, components: &mut HashSet, ) -> Result<(), FetchError> { - let active_tidb_addresses = self.fetch_active_tidb_addresses().await?; + let shard_config = Self::read_manager_shard_config_from_env()?; + let active_tidb_addresses = Self::filter_active_tidb_addresses( + self.fetch_active_tidb_addresses().await?, + shard_config, + )?; if !active_tidb_addresses.is_empty() { info!( message = "Fetched active TiDB instances from manager server", @@ -185,6 +202,7 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { port: None, status_port: None, hostname: None, + keyspace_name: None, }]), Value::Array(items) => { let mut addresses = Vec::new(); @@ -235,12 +253,15 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { let port = Self::extract_u16_field(obj, &["port", "primary_port"]); let status_port = Self::extract_u16_field(obj, &["status_port", "secondary_port"]); let hostname = Self::extract_string_field(obj, &["hostname", "pod_name", "instance_name"]); + let keyspace_name = + Self::extract_string_field(obj, &["keyspace_name", "keyspaceName", "keyspace"]); Some(ActiveTiDBAddress { host, port, status_port, hostname, + keyspace_name, }) } @@ -256,6 +277,130 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { .and_then(|raw| u16::try_from(raw).ok()) }) } + + fn read_manager_shard_config_from_env() -> Result, FetchError> { + ManagerShardConfig::from_env_values( + env::var(VECTOR_STS_REPLICA_COUNT_ENV).ok().as_deref(), + env::var(VECTOR_STS_ID_ENV).ok().as_deref(), + ) + } + + fn filter_active_tidb_addresses( + active_tidb_addresses: Vec, + shard_config: Option, + ) -> Result, FetchError> { + let Some(shard_config) = shard_config else { + return Ok(active_tidb_addresses); + }; + + let total_tidb_count = active_tidb_addresses.len(); + let mut filtered_tidbs = Vec::new(); + let mut 
skipped_missing_keyspace_count = 0usize; + + for active_tidb in active_tidb_addresses { + let Some(keyspace_name) = active_tidb + .keyspace_name + .as_deref() + .map(str::trim) + .filter(|name| !name.is_empty()) + else { + skipped_missing_keyspace_count += 1; + warn!( + message = "Skipping manager active TiDB without keyspace_name while keyspace sharding is enabled", + host = active_tidb.host, + port = ?active_tidb.port, + status_port = ?active_tidb.status_port, + hostname = ?active_tidb.hostname, + replica_count = shard_config.replica_count, + sts_id = shard_config.sts_id + ); + continue; + }; + + let shard = Self::hash_keyspace_name(keyspace_name) % shard_config.replica_count; + if shard == shard_config.sts_id { + filtered_tidbs.push(active_tidb); + } + } + + info!( + message = "Applied manager keyspace sharding to active TiDB instances", + replica_count = shard_config.replica_count, + sts_id = shard_config.sts_id, + total_tidb_count, + selected_tidb_count = filtered_tidbs.len(), + skipped_missing_keyspace_count + ); + + Ok(filtered_tidbs) + } + + fn hash_keyspace_name(keyspace_name: &str) -> u64 { + let mut hash = FNV1A_64_OFFSET_BASIS; + for byte in keyspace_name.as_bytes() { + hash ^= u64::from(*byte); + hash = hash.wrapping_mul(FNV1A_64_PRIME); + } + hash + } +} + +impl ManagerShardConfig { + fn from_env_values( + replica_count: Option<&str>, + sts_id: Option<&str>, + ) -> Result, FetchError> { + let replica_count = replica_count + .map(str::trim) + .filter(|value| !value.is_empty()); + let sts_id = sts_id.map(str::trim).filter(|value| !value.is_empty()); + + match (replica_count, sts_id) { + (None, None) => Ok(None), + (Some(_), None) => Err(FetchError::InvalidShardConfig { + message: format!( + "{VECTOR_STS_REPLICA_COUNT_ENV} is set but {VECTOR_STS_ID_ENV} is missing" + ), + }), + (None, Some(_)) => Err(FetchError::InvalidShardConfig { + message: format!( + "{VECTOR_STS_ID_ENV} is set but {VECTOR_STS_REPLICA_COUNT_ENV} is missing" + ), + }), + 
(Some(replica_count), Some(sts_id)) => { + let replica_count = + Self::parse_u64_env(VECTOR_STS_REPLICA_COUNT_ENV, replica_count)?; + let sts_id = Self::parse_u64_env(VECTOR_STS_ID_ENV, sts_id)?; + + if replica_count == 0 { + return Err(FetchError::InvalidShardConfig { + message: format!("{VECTOR_STS_REPLICA_COUNT_ENV} must be greater than 0"), + }); + } + + if sts_id >= replica_count { + return Err(FetchError::InvalidShardConfig { + message: format!( + "{VECTOR_STS_ID_ENV} ({sts_id}) must be smaller than {VECTOR_STS_REPLICA_COUNT_ENV} ({replica_count})" + ), + }); + } + + Ok(Some(Self { + replica_count, + sts_id, + })) + } + } + } + + fn parse_u64_env(env_name: &str, raw_value: &str) -> Result { + raw_value + .parse::() + .map_err(|_| FetchError::InvalidShardConfig { + message: format!("{env_name} must be a non-negative integer, got {raw_value}"), + }) + } } #[cfg(test)] @@ -265,8 +410,8 @@ mod tests { #[test] fn parse_response_new_schema() { let bytes = br#"[ - {"host":"10.0.0.1","port":4000,"status_port":10080,"hostname":"tidb-0"}, - {"host":"10.0.0.2","port":4000,"status_port":10080,"hostname":"tidb-1"} + {"host":"10.0.0.1","port":4000,"status_port":10080,"hostname":"tidb-0","keyspace_name":"tenant-a"}, + {"host":"10.0.0.2","port":4000,"status_port":10080,"hostname":"tidb-1","keyspace_name":"tenant-b"} ]"#; let addresses = TiDBManagerTopologyFetcher::parse_active_tidb_addresses_response(bytes).unwrap(); @@ -279,17 +424,32 @@ mod tests { port: Some(4000), status_port: Some(10080), hostname: Some("tidb-0".to_owned()), + keyspace_name: Some("tenant-a".to_owned()), }, ActiveTiDBAddress { host: "10.0.0.2".to_owned(), port: Some(4000), status_port: Some(10080), hostname: Some("tidb-1".to_owned()), + keyspace_name: Some("tenant-b".to_owned()), } ] ); } + #[test] + fn parse_response_supports_keyspace_aliases() { + let bytes = br#"[ + {"host":"10.0.0.1","keyspace":"tenant-a"}, + {"host":"10.0.0.2","keyspaceName":"tenant-b"} + ]"#; + let addresses = + 
TiDBManagerTopologyFetcher::parse_active_tidb_addresses_response(bytes).unwrap(); + + assert_eq!(addresses[0].keyspace_name.as_deref(), Some("tenant-a")); + assert_eq!(addresses[1].keyspace_name.as_deref(), Some("tenant-b")); + } + #[test] fn parse_response_invalid_format() { let bytes = br#"{"code":0,"message":"ok"}"#; @@ -370,4 +530,97 @@ mod tests { Some("super-vip-tidb-pool,canary-super-vip-tidb-pool") ); } + + #[test] + fn manager_shard_config_is_disabled_when_envs_are_missing() { + assert_eq!( + ManagerShardConfig::from_env_values(None, None).unwrap(), + None + ); + } + + #[test] + fn manager_shard_config_requires_both_envs() { + let err = ManagerShardConfig::from_env_values(Some("3"), None) + .expect_err("expected missing sts id to fail"); + assert!(matches!(err, FetchError::InvalidShardConfig { .. })); + + let err = ManagerShardConfig::from_env_values(None, Some("1")) + .expect_err("expected missing replica count to fail"); + assert!(matches!(err, FetchError::InvalidShardConfig { .. })); + } + + #[test] + fn manager_shard_config_validates_values() { + let err = ManagerShardConfig::from_env_values(Some("0"), Some("0")) + .expect_err("expected zero replica count to fail"); + assert!(matches!(err, FetchError::InvalidShardConfig { .. })); + + let err = ManagerShardConfig::from_env_values(Some("2"), Some("2")) + .expect_err("expected sts id overflow to fail"); + assert!(matches!(err, FetchError::InvalidShardConfig { .. })); + + let err = ManagerShardConfig::from_env_values(Some("abc"), Some("1")) + .expect_err("expected invalid replica count to fail"); + assert!(matches!(err, FetchError::InvalidShardConfig { .. 
})); + } + + #[test] + fn hash_keyspace_name_is_stable() { + assert_eq!( + TiDBManagerTopologyFetcher::hash_keyspace_name("tenant-a"), + 14046587775414411003 + ); + assert_eq!( + TiDBManagerTopologyFetcher::hash_keyspace_name("tenant-b"), + 14046588874926039214 + ); + } + + #[test] + fn filter_active_tidb_addresses_by_keyspace_shard() { + let addresses = vec![ + ActiveTiDBAddress { + host: "10.0.0.1".to_owned(), + port: Some(4000), + status_port: Some(10080), + hostname: Some("tidb-0".to_owned()), + keyspace_name: Some("tenant-a".to_owned()), + }, + ActiveTiDBAddress { + host: "10.0.0.2".to_owned(), + port: Some(4000), + status_port: Some(10080), + hostname: Some("tidb-1".to_owned()), + keyspace_name: Some("tenant-b".to_owned()), + }, + ActiveTiDBAddress { + host: "10.0.0.3".to_owned(), + port: Some(4000), + status_port: Some(10080), + hostname: Some("tidb-2".to_owned()), + keyspace_name: None, + }, + ]; + + let filtered = TiDBManagerTopologyFetcher::filter_active_tidb_addresses( + addresses, + Some(ManagerShardConfig { + replica_count: 4, + sts_id: 2, + }), + ) + .unwrap(); + + assert_eq!( + filtered, + vec![ActiveTiDBAddress { + host: "10.0.0.2".to_owned(), + port: Some(4000), + status_port: Some(10080), + hostname: Some("tidb-1".to_owned()), + keyspace_name: Some("tenant-b".to_owned()), + }] + ); + } } From 167a2de6c33f123805cfe241d4a5ec2497a978a6 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Mon, 23 Mar 2026 14:45:21 +0800 Subject: [PATCH 10/26] fix: address manager topology review feedback --- src/common/keyspace_cluster.rs | 69 +++++++++++++++++++++-- src/common/topology/arch.md | 2 +- src/common/topology/fetch/mod.rs | 60 ++++++++++++++++++++ src/common/topology/fetch/tidb_manager.rs | 60 ++++++++++++++++++-- 4 files changed, 180 insertions(+), 11 deletions(-) diff --git a/src/common/keyspace_cluster.rs b/src/common/keyspace_cluster.rs index 41a5249..83befa2 100644 --- a/src/common/keyspace_cluster.rs +++ b/src/common/keyspace_cluster.rs @@ -1,8 +1,10 @@ 
use std::collections::HashMap; use std::fs; +use std::num::NonZeroUsize; use std::sync::Arc; use std::time::Duration; +use lru::LruCache; use reqwest::{Certificate, Client, Identity, StatusCode}; use serde::Deserialize; use tokio::sync::Mutex; @@ -13,6 +15,7 @@ type BoxError = Box; const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); const CONNECT_TIMEOUT: Duration = Duration::from_secs(3); +const DEFAULT_KEYSPACE_ROUTE_CACHE_CAPACITY: usize = 10_000; const ORG_ID_KEYS: &[&str] = &["serverless_tenant_id"]; const CLUSTER_ID_KEYS: &[&str] = &["serverless_cluster_id"]; @@ -27,7 +30,7 @@ pub struct KeyspaceRoute { pub struct PdKeyspaceResolver { base_url: String, client: Client, - cache: Arc>>, + cache: Arc>>, } #[derive(Debug, Deserialize)] @@ -45,11 +48,27 @@ impl PdKeyspaceResolver { pd_address: impl Into, pd_tls: Option<&TlsConfig>, client: Client, + ) -> Self { + Self::new_with_client_and_capacity( + pd_address, + pd_tls, + client, + DEFAULT_KEYSPACE_ROUTE_CACHE_CAPACITY, + ) + } + + fn new_with_client_and_capacity( + pd_address: impl Into, + pd_tls: Option<&TlsConfig>, + client: Client, + cache_capacity: usize, ) -> Self { Self { base_url: normalize_pd_address(&pd_address.into(), pd_tls.is_some()), client, - cache: Arc::new(Mutex::new(HashMap::new())), + cache: Arc::new(Mutex::new(LruCache::new( + NonZeroUsize::new(cache_capacity.max(1)).unwrap(), + ))), } } @@ -61,9 +80,11 @@ impl PdKeyspaceResolver { return Ok(None); } - if let Some(cached) = self.cache.lock().await.get(keyspace_name).cloned() { + let mut cache = self.cache.lock().await; + if let Some(cached) = cache.get(keyspace_name).cloned() { return Ok(Some(cached)); } + drop(cache); let encoded_keyspace = byte_serialize(keyspace_name.as_bytes()).collect::(); let response = self @@ -98,8 +119,10 @@ impl PdKeyspaceResolver { self.cache .lock() .await - .insert(keyspace_name.to_string(), route); + .put(keyspace_name.to_string(), route); } + // Intentionally do not cache misses or transient failures so a 
later retry can recover + // once PD metadata becomes visible. Ok(route) } @@ -328,4 +351,42 @@ mod tests { server_handle.abort(); } + + #[tokio::test] + async fn resolve_keyspace_cache_is_bounded() { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let address = listener.local_addr().unwrap(); + let server = Server::from_tcp(listener) + .unwrap() + .serve(make_service_fn(move |_| async move { + Ok::<_, Infallible>(service_fn(move |request: Request| async move { + let keyspace_name = request + .uri() + .path() + .trim_start_matches("/pd/api/v2/keyspaces/"); + let body = format!( + r#"{{"config":{{"serverless_tenant_id":"30018","serverless_cluster_id":"{}"}}}}"#, + keyspace_name + ); + Ok::<_, Infallible>(Response::new(Body::from(body))) + })) + })); + let server_handle = tokio::spawn(server); + + let client = Client::builder().no_proxy().build().unwrap(); + let resolver = PdKeyspaceResolver::new_with_client_and_capacity( + format!("http://{}", address), + None, + client, + 2, + ); + + let _ = resolver.resolve_keyspace("ks-1").await.unwrap(); + let _ = resolver.resolve_keyspace("ks-2").await.unwrap(); + let _ = resolver.resolve_keyspace("ks-3").await.unwrap(); + + assert_eq!(resolver.cache.lock().await.len(), 2); + + server_handle.abort(); + } } diff --git a/src/common/topology/arch.md b/src/common/topology/arch.md index 79e55ef..3566266 100644 --- a/src/common/topology/arch.md +++ b/src/common/topology/arch.md @@ -75,7 +75,7 @@ pub struct TopologyFetcher { Legacy manager-based TiDB discovery also supports keyspace-aware sharding: - `manager_server_address`: fetch active TiDB instances from manager instead of etcd -- `tidb_namespace`: namespace list sent to `/api/tidb/get_active_tidb?namespace=...` +- `tidb_namespace`: namespace list sent to `/api/tidb/get_active_tidb?namespace=...`; required when `manager_server_address` is set - `VECTOR_STS_REPLICA_COUNT`: total number of Vector StatefulSet replicas - `VECTOR_STS_ID`: current Vector StatefulSet 
ordinal diff --git a/src/common/topology/fetch/mod.rs b/src/common/topology/fetch/mod.rs index 4403649..2c739a2 100644 --- a/src/common/topology/fetch/mod.rs +++ b/src/common/topology/fetch/mod.rs @@ -77,6 +77,8 @@ impl LegacyTopologyFetcher { let manager_server_address = manager_server_address .map(Self::polish_manager_server_address) .transpose()?; + let tidb_namespace = + Self::normalize_tidb_namespace(manager_server_address.as_deref(), tidb_namespace)?; let http_client = Self::build_http_client(tls_config.as_ref(), proxy_config)?; let etcd_client = Self::build_etcd_client(&pd_address, &tls_config).await?; @@ -137,6 +139,33 @@ impl LegacyTopologyFetcher { Ok(address) } + fn normalize_tidb_namespace( + manager_server_address: Option<&str>, + tidb_namespace: Option, + ) -> Result, FetchError> { + let tidb_namespace = tidb_namespace.and_then(|namespaces| { + let normalized = namespaces + .split(',') + .map(str::trim) + .filter(|namespace| !namespace.is_empty()) + .collect::>(); + if normalized.is_empty() { + None + } else { + Some(normalized.join(",")) + } + }); + + if manager_server_address.is_some() && tidb_namespace.is_none() { + return Err(FetchError::ConfigurationError { + message: "tidb_namespace is required when manager_server_address is configured" + .to_string(), + }); + } + + Ok(tidb_namespace) + } + fn polish_manager_server_address(mut address: String) -> Result { let uri: hyper::Uri = address.parse().context(ParseAddressSnafu)?; if uri.scheme().is_none() { @@ -356,3 +385,34 @@ impl TopologyFetcher { // println!("{:?}", components); // } // } + +#[cfg(test)] +mod tests { + use super::LegacyTopologyFetcher; + + #[test] + fn normalize_tidb_namespace_requires_value_when_manager_is_configured() { + let err = + LegacyTopologyFetcher::normalize_tidb_namespace(Some("http://manager:8080"), None) + .expect_err("expected missing namespace to fail"); + + assert!(matches!(err, super::FetchError::ConfigurationError { .. 
})); + } + + #[test] + fn normalize_tidb_namespace_trims_and_joins_values() { + let normalized = LegacyTopologyFetcher::normalize_tidb_namespace( + Some("http://manager:8080"), + Some(" ns-a, ns-b , ".to_string()), + ) + .unwrap(); + + assert_eq!(normalized.as_deref(), Some("ns-a,ns-b")); + } + + #[test] + fn normalize_tidb_namespace_allows_missing_value_without_manager() { + let normalized = LegacyTopologyFetcher::normalize_tidb_namespace(None, None).unwrap(); + assert_eq!(normalized, None); + } +} diff --git a/src/common/topology/fetch/tidb_manager.rs b/src/common/topology/fetch/tidb_manager.rs index f1ae586..7596c92 100644 --- a/src/common/topology/fetch/tidb_manager.rs +++ b/src/common/topology/fetch/tidb_manager.rs @@ -1,5 +1,6 @@ use std::{collections::HashSet, env}; +use hyper::body::HttpBody; use serde_json::{Map, Value}; use snafu::{ResultExt, Snafu}; use vector::http::HttpClient; @@ -11,6 +12,7 @@ const GET_ACTIVE_TIDB_PATH: &str = "/api/tidb/get_active_tidb"; const DEFAULT_TIDB_PRIMARY_PORT: u16 = 4000; const DEFAULT_TIDB_STATUS_PORT: u16 = 10080; const MAX_RESPONSE_DEPTH: usize = 8; +const MAX_MANAGER_RESPONSE_BYTES: usize = 8 * 1024 * 1024; const VECTOR_STS_REPLICA_COUNT_ENV: &str = "VECTOR_STS_REPLICA_COUNT"; const VECTOR_STS_ID_ENV: &str = "VECTOR_STS_ID"; const FNV1A_64_OFFSET_BASIS: u64 = 0xcbf29ce484222325; @@ -24,6 +26,8 @@ pub enum FetchError { GetActiveTiDBs { source: vector::http::HttpError }, #[snafu(display("Failed to read active tidb response bytes: {}", source))] GetActiveTiDBsBytes { source: hyper::Error }, + #[snafu(display("Manager active tidb response exceeds limit of {} bytes", limit_bytes))] + ActiveTiDBResponseTooLarge { limit_bytes: usize }, #[snafu(display("Failed to parse active tidb response JSON text: {}", source))] ActiveTiDBJsonFromStr { source: serde_json::Error }, #[snafu(display("Invalid manager server response: {}", message))] @@ -77,15 +81,24 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { 
self.fetch_active_tidb_addresses().await?, shard_config, )?; - if !active_tidb_addresses.is_empty() { + if active_tidb_addresses.is_empty() { info!( - message = "Fetched active TiDB instances from manager server", + message = "No active TiDB instances selected from manager server", manager_server_address = self.manager_server_address, tidb_namespace = ?self.tidb_namespace, - tidb_count = active_tidb_addresses.len() + tidb_count = 0, + shard_config = ?shard_config ); + return Ok(()); } + info!( + message = "Fetched active TiDB instances from manager server", + manager_server_address = self.manager_server_address, + tidb_namespace = ?self.tidb_namespace, + tidb_count = active_tidb_addresses.len() + ); + for active_tidb in active_tidb_addresses { let (host, primary_port) = Self::parse_tidb_host_and_primary(&active_tidb.host, active_tidb.port)?; @@ -117,9 +130,7 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { .send(req) .await .context(GetActiveTiDBsSnafu)?; - let bytes = hyper::body::to_bytes(res.into_body()) - .await - .context(GetActiveTiDBsBytesSnafu)?; + let bytes = Self::read_response_body_with_limit(res.into_body()).await?; Self::parse_active_tidb_addresses_response(&bytes) } @@ -343,6 +354,31 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { } hash } + + async fn read_response_body_with_limit(mut body: hyper::Body) -> Result, FetchError> { + if body + .size_hint() + .upper() + .is_some_and(|upper| upper > MAX_MANAGER_RESPONSE_BYTES as u64) + { + return Err(FetchError::ActiveTiDBResponseTooLarge { + limit_bytes: MAX_MANAGER_RESPONSE_BYTES, + }); + } + + let mut bytes = Vec::new(); + while let Some(chunk) = body.data().await { + let chunk = chunk.context(GetActiveTiDBsBytesSnafu)?; + if bytes.len().saturating_add(chunk.len()) > MAX_MANAGER_RESPONSE_BYTES { + return Err(FetchError::ActiveTiDBResponseTooLarge { + limit_bytes: MAX_MANAGER_RESPONSE_BYTES, + }); + } + bytes.extend_from_slice(&chunk); + } + + Ok(bytes) + } } impl ManagerShardConfig { @@ -405,6 +441,8 @@ impl 
ManagerShardConfig { #[cfg(test)] mod tests { + use hyper::Body; + use super::*; #[test] @@ -623,4 +661,14 @@ mod tests { }] ); } + + #[tokio::test] + async fn read_response_body_with_limit_rejects_oversized_response() { + let body = Body::from(vec![b'x'; MAX_MANAGER_RESPONSE_BYTES + 1]); + let err = TiDBManagerTopologyFetcher::read_response_body_with_limit(body) + .await + .expect_err("expected oversized response to fail"); + + assert!(matches!(err, FetchError::ActiveTiDBResponseTooLarge { .. })); + } } From b8f997fcc960886fc0c6283cee6a2fe461b73510 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Mon, 23 Mar 2026 21:29:51 +0800 Subject: [PATCH 11/26] feat: add tikv topsql collection switch --- src/sources/topsql_v2/arch.md | 1 + src/sources/topsql_v2/controller.rs | 44 +++++++++++++++++++++++++++++ src/sources/topsql_v2/mod.rs | 12 ++++++++ 3 files changed, 57 insertions(+) diff --git a/src/sources/topsql_v2/arch.md b/src/sources/topsql_v2/arch.md index 8cd9777..86c0182 100644 --- a/src/sources/topsql_v2/arch.md +++ b/src/sources/topsql_v2/arch.md @@ -51,6 +51,7 @@ Legacy mode discovery options: - `pd_address`: used for PD/store discovery and schema management - `manager_server_address`: optional manager endpoint used to fetch active TiDB instances - `tidb_namespace`: manager namespace list used when calling `/api/tidb/get_active_tidb` +- `enable_tikv_topsql`: whether to collect `tikv_topsql` and `tikv_topregion`; defaults to `true` ## Data Flow diff --git a/src/sources/topsql_v2/controller.rs b/src/sources/topsql_v2/controller.rs index e519939..efac206 100644 --- a/src/sources/topsql_v2/controller.rs +++ b/src/sources/topsql_v2/controller.rs @@ -28,6 +28,7 @@ pub struct Controller { init_retry_delay: Duration, top_n: usize, downsampling_interval: u32, + enable_tikv_topsql: bool, topru: TopRUConfig, schema_cache: Arc, @@ -51,6 +52,7 @@ impl Controller { init_retry_delay: Duration, top_n: usize, downsampling_interval: u32, + enable_tikv_topsql: bool, 
schema_update_interval: Duration, tls_config: Option, proxy_config: &ProxyConfig, @@ -85,6 +87,7 @@ impl Controller { init_retry_delay, top_n, downsampling_interval, + enable_tikv_topsql, topru, schema_cache, schema_update_interval, @@ -126,6 +129,8 @@ impl Controller { self.topo_fetcher .get_up_components(&mut latest_components) .await?; + latest_components + .retain(|component| should_collect_component(component, self.enable_tikv_topsql)); let prev_components = self.components.clone(); let newcomers = latest_components.difference(&prev_components); @@ -265,6 +270,10 @@ impl Controller { } fn start_component(&mut self, component: &Component) -> bool { + if !should_collect_component(component, self.enable_tikv_topsql) { + return false; + } + let source = TopSQLSource::new( component.clone(), self.tls.clone(), @@ -347,3 +356,38 @@ impl Controller { info!(message = "All TopSQL sources have been shut down."); } } + +fn should_collect_component(component: &Component, enable_tikv_topsql: bool) -> bool { + enable_tikv_topsql || component.instance_type != InstanceType::TiKV +} + +#[cfg(test)] +mod tests { + use super::*; + + fn component(instance_type: InstanceType) -> Component { + Component { + instance_type, + host: "127.0.0.1".to_string(), + primary_port: 20160, + secondary_port: 10080, + instance_name: None, + } + } + + #[test] + fn should_collect_tikv_component_only_when_enabled() { + assert!(should_collect_component( + &component(InstanceType::TiDB), + false + )); + assert!(!should_collect_component( + &component(InstanceType::TiKV), + false + )); + assert!(should_collect_component( + &component(InstanceType::TiKV), + true + )); + } +} diff --git a/src/sources/topsql_v2/mod.rs b/src/sources/topsql_v2/mod.rs index fbac2d0..6508829 100644 --- a/src/sources/topsql_v2/mod.rs +++ b/src/sources/topsql_v2/mod.rs @@ -93,6 +93,11 @@ pub struct TopSQLConfig { #[serde(default = "default_downsampling_interval")] pub downsampling_interval: u32, + /// Whether to collect TiKV 
TopSQL data (`tikv_topsql` and `tikv_topregion`). + /// When disabled, only TiDB TopSQL/TopRU data is collected. + #[serde(default = "default_enable_tikv_topsql")] + pub enable_tikv_topsql: bool, + /// TopRU (Resource Unit) collection config. Only applies to TiDB upstream. #[serde(default)] pub topru: TopRUConfig, @@ -114,6 +119,10 @@ pub const fn default_downsampling_interval() -> u32 { 60 } +pub const fn default_enable_tikv_topsql() -> bool { + true +} + impl GenerateConfig for TopSQLConfig { fn generate_config() -> toml::Value { toml::Value::try_from(Self { @@ -127,6 +136,7 @@ impl GenerateConfig for TopSQLConfig { topology_fetch_interval_seconds: default_topology_fetch_interval(), top_n: default_top_n(), downsampling_interval: default_downsampling_interval(), + enable_tikv_topsql: default_enable_tikv_topsql(), topru: TopRUConfig::default(), }) .unwrap() @@ -149,6 +159,7 @@ impl SourceConfig for TopSQLConfig { let init_retry_delay = Duration::from_secs_f64(self.init_retry_delay_seconds); let top_n = self.top_n; let downsampling_interval = self.downsampling_interval; + let enable_tikv_topsql = self.enable_tikv_topsql; let topru = self.topru.clone(); let schema_update_interval = Duration::from_secs(60); @@ -161,6 +172,7 @@ impl SourceConfig for TopSQLConfig { init_retry_delay, top_n, downsampling_interval, + enable_tikv_topsql, schema_update_interval, tls, &cx.proxy, From 9e8807f6cd599626df5ae206e873fb0a420e5989 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Mon, 23 Mar 2026 22:16:59 +0800 Subject: [PATCH 12/26] feat: update topru storage path layout --- src/sinks/topsql_data_deltalake/arch.md | 1 + src/sinks/topsql_data_deltalake/processor.rs | 55 ++++++++++++++++---- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/src/sinks/topsql_data_deltalake/arch.md b/src/sinks/topsql_data_deltalake/arch.md index 1a77bdf..ed7fb91 100644 --- a/src/sinks/topsql_data_deltalake/arch.md +++ b/src/sinks/topsql_data_deltalake/arch.md @@ -57,6 +57,7 @@ pub struct 
TopSQLDataDeltaLakeConfig { - **Time Partitioning**: Partition by execution time - **Schema Optimization**: Optimized schema for TopSQL data - **Keyspace-based Routing**: Optional PD keyspace lookup can prepend `org=/cluster=` path segments before the table layout, which is especially useful for `topru` data written to shared S3 prefixes +- **TopRU Path Layout**: `topsql_topru` is written under `type=topsql/component=topru/instance=default` so shared prefixes can keep a stable `type=topsql` partition while separating the TopRU payload by component ## Dependencies diff --git a/src/sinks/topsql_data_deltalake/processor.rs b/src/sinks/topsql_data_deltalake/processor.rs index a08e0df..653186d 100644 --- a/src/sinks/topsql_data_deltalake/processor.rs +++ b/src/sinks/topsql_data_deltalake/processor.rs @@ -519,23 +519,24 @@ impl TopSQLDeltaLakeSink { } fn build_table_path(&self, table_name: &str, route: Option<&KeyspaceRoute>) -> PathBuf { - let (table_type, table_instance) = Self::table_partition_values(table_name); - let mut segments = Vec::new(); if let Some(route) = route { segments.push(format!("org={}", route.org_id)); segments.push(format!("cluster={}", route.cluster_id)); } - segments.push(format!("type=topsql_{}", table_type)); - segments.push(format!("instance={}", table_instance)); + segments.extend(Self::table_partition_segments(table_name)); let segment_refs: Vec<&str> = segments.iter().map(|segment| segment.as_str()).collect(); Self::join_path(&self.base_path, &segment_refs) } - fn table_partition_values(table_name: &str) -> (&str, &str) { + fn table_partition_segments(table_name: &str) -> Vec { if table_name == SOURCE_TABLE_TOPRU { - ("topru", "default") + vec![ + "type=topsql".to_string(), + "component=topru".to_string(), + "instance=default".to_string(), + ] } else { match table_name .strip_prefix("topsql_") @@ -544,14 +545,20 @@ impl TopSQLDeltaLakeSink { Some((table_type, table_instance)) if !table_type.is_empty() && !table_instance.is_empty() => { - 
(table_type, table_instance) + vec![ + format!("type=topsql_{}", table_type), + format!("instance={}", table_instance), + ] } _ => { error!( "Unexpected table_name format (expected `topsql_{{type}}_{{instance}}` or `topsql_topru`): {}", table_name ); - ("unknown", "unknown") + vec![ + "type=topsql_unknown".to_string(), + "instance=unknown".to_string(), + ] } } } @@ -777,7 +784,37 @@ mod tests { assert_eq!( table_path, - PathBuf::from("/tmp/deltalake/type=topsql_topru/instance=default") + PathBuf::from("/tmp/deltalake/type=topsql/component=topru/instance=default") + ); + } + + #[test] + fn test_build_topru_table_path_with_meta_route_uses_component_partition() { + let (sink, _) = TopSQLDeltaLakeSink::new_for_test( + PathBuf::from("s3://o11y-prod-shared-us-west-2-premium/deltalake"), + vec![], + WriteConfig { + batch_size: 1, + timeout_secs: 0, + }, + 180, + None, + None, + ); + + let table_path = sink.build_table_path( + "topsql_topru", + Some(&KeyspaceRoute { + org_id: "1369847559692509642".to_string(), + cluster_id: "10110362358366286743".to_string(), + }), + ); + + assert_eq!( + table_path, + PathBuf::from( + "s3://o11y-prod-shared-us-west-2-premium/deltalake/org=1369847559692509642/cluster=10110362358366286743/type=topsql/component=topru/instance=default" + ) ); } From 6c2ebe5c309c40e8a6cd63b51681c3efcd706951 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Tue, 24 Mar 2026 10:45:39 +0800 Subject: [PATCH 13/26] Revert "feat: update topru storage path layout" This reverts commit 9e8807f6cd599626df5ae206e873fb0a420e5989. 
--- src/sinks/topsql_data_deltalake/arch.md | 1 - src/sinks/topsql_data_deltalake/processor.rs | 55 ++++---------------- 2 files changed, 9 insertions(+), 47 deletions(-) diff --git a/src/sinks/topsql_data_deltalake/arch.md b/src/sinks/topsql_data_deltalake/arch.md index ed7fb91..1a77bdf 100644 --- a/src/sinks/topsql_data_deltalake/arch.md +++ b/src/sinks/topsql_data_deltalake/arch.md @@ -57,7 +57,6 @@ pub struct TopSQLDataDeltaLakeConfig { - **Time Partitioning**: Partition by execution time - **Schema Optimization**: Optimized schema for TopSQL data - **Keyspace-based Routing**: Optional PD keyspace lookup can prepend `org=/cluster=` path segments before the table layout, which is especially useful for `topru` data written to shared S3 prefixes -- **TopRU Path Layout**: `topsql_topru` is written under `type=topsql/component=topru/instance=default` so shared prefixes can keep a stable `type=topsql` partition while separating the TopRU payload by component ## Dependencies diff --git a/src/sinks/topsql_data_deltalake/processor.rs b/src/sinks/topsql_data_deltalake/processor.rs index 653186d..a08e0df 100644 --- a/src/sinks/topsql_data_deltalake/processor.rs +++ b/src/sinks/topsql_data_deltalake/processor.rs @@ -519,24 +519,23 @@ impl TopSQLDeltaLakeSink { } fn build_table_path(&self, table_name: &str, route: Option<&KeyspaceRoute>) -> PathBuf { + let (table_type, table_instance) = Self::table_partition_values(table_name); + let mut segments = Vec::new(); if let Some(route) = route { segments.push(format!("org={}", route.org_id)); segments.push(format!("cluster={}", route.cluster_id)); } - segments.extend(Self::table_partition_segments(table_name)); + segments.push(format!("type=topsql_{}", table_type)); + segments.push(format!("instance={}", table_instance)); let segment_refs: Vec<&str> = segments.iter().map(|segment| segment.as_str()).collect(); Self::join_path(&self.base_path, &segment_refs) } - fn table_partition_segments(table_name: &str) -> Vec { + fn 
table_partition_values(table_name: &str) -> (&str, &str) { if table_name == SOURCE_TABLE_TOPRU { - vec![ - "type=topsql".to_string(), - "component=topru".to_string(), - "instance=default".to_string(), - ] + ("topru", "default") } else { match table_name .strip_prefix("topsql_") @@ -545,20 +544,14 @@ impl TopSQLDeltaLakeSink { Some((table_type, table_instance)) if !table_type.is_empty() && !table_instance.is_empty() => { - vec![ - format!("type=topsql_{}", table_type), - format!("instance={}", table_instance), - ] + (table_type, table_instance) } _ => { error!( "Unexpected table_name format (expected `topsql_{{type}}_{{instance}}` or `topsql_topru`): {}", table_name ); - vec![ - "type=topsql_unknown".to_string(), - "instance=unknown".to_string(), - ] + ("unknown", "unknown") } } } @@ -784,37 +777,7 @@ mod tests { assert_eq!( table_path, - PathBuf::from("/tmp/deltalake/type=topsql/component=topru/instance=default") - ); - } - - #[test] - fn test_build_topru_table_path_with_meta_route_uses_component_partition() { - let (sink, _) = TopSQLDeltaLakeSink::new_for_test( - PathBuf::from("s3://o11y-prod-shared-us-west-2-premium/deltalake"), - vec![], - WriteConfig { - batch_size: 1, - timeout_secs: 0, - }, - 180, - None, - None, - ); - - let table_path = sink.build_table_path( - "topsql_topru", - Some(&KeyspaceRoute { - org_id: "1369847559692509642".to_string(), - cluster_id: "10110362358366286743".to_string(), - }), - ); - - assert_eq!( - table_path, - PathBuf::from( - "s3://o11y-prod-shared-us-west-2-premium/deltalake/org=1369847559692509642/cluster=10110362358366286743/type=topsql/component=topru/instance=default" - ) + PathBuf::from("/tmp/deltalake/type=topsql_topru/instance=default") ); } From f6c4670e854ee3742b0640cb594c2f8146220045 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Tue, 24 Mar 2026 11:01:55 +0800 Subject: [PATCH 14/26] topsql: restore component-based data path layout --- src/sinks/topsql_data_deltalake/arch.md | 1 + 
src/sinks/topsql_data_deltalake/processor.rs | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/sinks/topsql_data_deltalake/arch.md b/src/sinks/topsql_data_deltalake/arch.md index 1a77bdf..38d96ca 100644 --- a/src/sinks/topsql_data_deltalake/arch.md +++ b/src/sinks/topsql_data_deltalake/arch.md @@ -57,6 +57,7 @@ pub struct TopSQLDataDeltaLakeConfig { - **Time Partitioning**: Partition by execution time - **Schema Optimization**: Optimized schema for TopSQL data - **Keyspace-based Routing**: Optional PD keyspace lookup can prepend `org=/cluster=` path segments before the table layout, which is especially useful for `topru` data written to shared S3 prefixes +- **Component-based Path Layout**: TopSQL data is partitioned by `component=` and `instance=` ## Dependencies diff --git a/src/sinks/topsql_data_deltalake/processor.rs b/src/sinks/topsql_data_deltalake/processor.rs index a08e0df..110500a 100644 --- a/src/sinks/topsql_data_deltalake/processor.rs +++ b/src/sinks/topsql_data_deltalake/processor.rs @@ -526,7 +526,7 @@ impl TopSQLDeltaLakeSink { segments.push(format!("org={}", route.org_id)); segments.push(format!("cluster={}", route.cluster_id)); } - segments.push(format!("type=topsql_{}", table_type)); + segments.push(format!("component={}", table_type)); segments.push(format!("instance={}", table_instance)); let segment_refs: Vec<&str> = segments.iter().map(|segment| segment.as_str()).collect(); @@ -754,7 +754,7 @@ mod tests { assert_eq!( table_path, PathBuf::from( - "s3://o11y-prod-shared-us-west-2-premium/deltalake/org=1369847559692509642/cluster=10110362358366286743/type=topsql_tidb/instance=127.0.0.1:10080" + "s3://o11y-prod-shared-us-west-2-premium/deltalake/org=1369847559692509642/cluster=10110362358366286743/component=tidb/instance=127.0.0.1:10080" ) ); } @@ -777,7 +777,7 @@ mod tests { assert_eq!( table_path, - PathBuf::from("/tmp/deltalake/type=topsql_topru/instance=default") + 
PathBuf::from("/tmp/deltalake/component=topru/instance=default") ); } From 54c1fda92122aaef912ffce41a27854c8e4314dd Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Tue, 24 Mar 2026 17:42:01 +0800 Subject: [PATCH 15/26] topru: address remaining review feedback --- src/common/topology/fetch/mod.rs | 12 + src/common/topology/fetch/tidb_manager.rs | 28 +- src/sinks/topsql_data_deltalake/processor.rs | 43 +- src/sinks/topsql_meta_deltalake/processor.rs | 455 +++++++++---------- src/sources/topsql_v2/mod.rs | 4 +- 5 files changed, 264 insertions(+), 278 deletions(-) diff --git a/src/common/topology/fetch/mod.rs b/src/common/topology/fetch/mod.rs index 2c739a2..6d442d8 100644 --- a/src/common/topology/fetch/mod.rs +++ b/src/common/topology/fetch/mod.rs @@ -46,6 +46,8 @@ pub enum FetchError { FetchTiDBTopology { source: tidb::FetchError }, #[snafu(display("Failed to fetch tidb topology from manager server: {}", source))] FetchTiDBFromManagerServerTopology { source: tidb_manager::FetchError }, + #[snafu(display("Failed to read manager shard config: {}", source))] + ReadManagerShardConfig { source: tidb_manager::FetchError }, #[snafu(display("Failed to fetch store topology: {}", source))] FetchStoreTopology { source: store::FetchError }, #[snafu(display("Failed to fetch tidb nextgen topology: {}", source))] @@ -61,6 +63,7 @@ pub struct LegacyTopologyFetcher { pd_address: String, manager_server_address: Option, tidb_namespace: Option, + manager_shard_config: Option, http_client: HttpClient, pub etcd_client: etcd_client::Client, } @@ -79,6 +82,13 @@ impl LegacyTopologyFetcher { .transpose()?; let tidb_namespace = Self::normalize_tidb_namespace(manager_server_address.as_deref(), tidb_namespace)?; + let manager_shard_config = if manager_server_address.is_some() { + // Shard env vars are process-scoped, so we parse them once during fetcher init. + tidb_manager::read_manager_shard_config_from_env() + .context(ReadManagerShardConfigSnafu)? 
+ } else { + None + }; let http_client = Self::build_http_client(tls_config.as_ref(), proxy_config)?; let etcd_client = Self::build_etcd_client(&pd_address, &tls_config).await?; @@ -86,6 +96,7 @@ impl LegacyTopologyFetcher { pd_address, manager_server_address, tidb_namespace, + manager_shard_config, http_client, etcd_client, }) @@ -104,6 +115,7 @@ impl LegacyTopologyFetcher { manager_server_address, self.tidb_namespace.as_deref(), &self.http_client, + self.manager_shard_config, ) .get_up_tidbs(components) .await diff --git a/src/common/topology/fetch/tidb_manager.rs b/src/common/topology/fetch/tidb_manager.rs index 7596c92..add5828 100644 --- a/src/common/topology/fetch/tidb_manager.rs +++ b/src/common/topology/fetch/tidb_manager.rs @@ -48,7 +48,7 @@ struct ActiveTiDBAddress { } #[derive(Debug, Clone, Copy, Eq, PartialEq)] -struct ManagerShardConfig { +pub(super) struct ManagerShardConfig { replica_count: u64, sts_id: u64, } @@ -57,6 +57,7 @@ pub struct TiDBManagerTopologyFetcher<'a> { manager_server_address: &'a str, tidb_namespace: Option<&'a str>, http_client: &'a HttpClient, + shard_config: Option, } impl<'a> TiDBManagerTopologyFetcher<'a> { @@ -64,11 +65,13 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { manager_server_address: &'a str, tidb_namespace: Option<&'a str>, http_client: &'a HttpClient, + shard_config: Option, ) -> Self { Self { manager_server_address, tidb_namespace, http_client, + shard_config, } } @@ -76,10 +79,9 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { &self, components: &mut HashSet, ) -> Result<(), FetchError> { - let shard_config = Self::read_manager_shard_config_from_env()?; let active_tidb_addresses = Self::filter_active_tidb_addresses( self.fetch_active_tidb_addresses().await?, - shard_config, + self.shard_config, )?; if active_tidb_addresses.is_empty() { info!( @@ -87,7 +89,7 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { manager_server_address = self.manager_server_address, tidb_namespace = ?self.tidb_namespace, tidb_count = 0, - 
shard_config = ?shard_config + shard_config = ?self.shard_config ); return Ok(()); } @@ -289,13 +291,6 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { }) } - fn read_manager_shard_config_from_env() -> Result, FetchError> { - ManagerShardConfig::from_env_values( - env::var(VECTOR_STS_REPLICA_COUNT_ENV).ok().as_deref(), - env::var(VECTOR_STS_ID_ENV).ok().as_deref(), - ) - } - fn filter_active_tidb_addresses( active_tidb_addresses: Vec, shard_config: Option, @@ -347,6 +342,9 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { } fn hash_keyspace_name(keyspace_name: &str) -> u64 { + // FNV-1a keeps sharding deterministic across process restarts and languages. + // We intentionally avoid std::hash because it is not stable across runs, and + // any service that shards keyspaces the same way must reuse this exact contract. let mut hash = FNV1A_64_OFFSET_BASIS; for byte in keyspace_name.as_bytes() { hash ^= u64::from(*byte); @@ -381,6 +379,14 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { } } +pub(super) fn read_manager_shard_config_from_env() -> Result, FetchError> +{ + ManagerShardConfig::from_env_values( + env::var(VECTOR_STS_REPLICA_COUNT_ENV).ok().as_deref(), + env::var(VECTOR_STS_ID_ENV).ok().as_deref(), + ) +} + impl ManagerShardConfig { fn from_env_values( replica_count: Option<&str>, diff --git a/src/sinks/topsql_data_deltalake/processor.rs b/src/sinks/topsql_data_deltalake/processor.rs index 110500a..863fa9d 100644 --- a/src/sinks/topsql_data_deltalake/processor.rs +++ b/src/sinks/topsql_data_deltalake/processor.rs @@ -251,6 +251,7 @@ lazy_static! 
{ const ROUTE_RESOLUTION_RETRY_DELAY: Duration = Duration::from_secs(5); /// Delta Lake sink processor +#[derive(Clone)] pub struct TopSQLDeltaLakeSink { base_path: PathBuf, tables: Vec, @@ -278,12 +279,8 @@ impl TopSQLDeltaLakeSink { storage_options: Option>, keyspace_route_resolver: Option, ) -> Self { - // Create a channel with capacity 1 let (tx, rx) = mpsc::channel(1); - let tx = Arc::new(tx); - - // Create sink instance - let sink = Arc::new(Self { + let sink = Self { base_path, tables, write_config, @@ -291,41 +288,13 @@ impl TopSQLDeltaLakeSink { storage_options, keyspace_route_resolver, writers: Arc::new(Mutex::new(HashMap::new())), - tx: Arc::clone(&tx), - }); - - // Spawn process_events_loop as a separate tokio task to avoid blocking - let sink_clone = Arc::clone(&sink); + tx: Arc::new(tx), + }; + let sink_clone = sink.clone(); tokio::spawn(async move { sink_clone.process_events_loop(rx).await; }); - - // Return the sink (Arc::try_unwrap will fail because tokio task holds a reference, - // so we use unsafe to manually get the inner value without decrementing the reference count) - // Safety: We know there's exactly one more reference (the tokio task), - // but we need to return Self, not Arc. The tokio task will continue - // to hold its reference, which is safe because TopSQLDeltaLakeSink contains - // only Arc and atomic types that are safe to share. - // We use into_raw to get a raw pointer, then manually reconstruct the value. 
- unsafe { - let ptr = Arc::into_raw(sink); - // Get a reference to the inner value - let inner_ref = &*ptr; - // Clone the value (TopSQLDeltaLakeSink contains only Arc and atomic types, so cloning is safe) - let inner_value = TopSQLDeltaLakeSink { - base_path: inner_ref.base_path.clone(), - tables: inner_ref.tables.clone(), - write_config: inner_ref.write_config.clone(), - max_delay_secs: inner_ref.max_delay_secs, - storage_options: inner_ref.storage_options.clone(), - keyspace_route_resolver: inner_ref.keyspace_route_resolver.clone(), - writers: Arc::clone(&inner_ref.writers), - tx: Arc::clone(&inner_ref.tx), - }; - // Reconstruct the Arc (so the tokio task's reference remains valid) - let _ = Arc::from_raw(ptr); - inner_value - } + sink } #[cfg(test)] diff --git a/src/sinks/topsql_meta_deltalake/processor.rs b/src/sinks/topsql_meta_deltalake/processor.rs index f14ec1a..9aa9e8a 100644 --- a/src/sinks/topsql_meta_deltalake/processor.rs +++ b/src/sinks/topsql_meta_deltalake/processor.rs @@ -5,16 +5,16 @@ use std::time::{Duration, Instant}; use futures::{stream::BoxStream, StreamExt}; use lru::LruCache; -use tokio::sync::Mutex; use tokio::sync::mpsc; +use tokio::sync::Mutex; use vector_lib::event::Event; use vector_lib::sink::StreamSink; use crate::common::deltalake_writer::{DeltaLakeWriter, DeltaTableConfig, WriteConfig}; use crate::sources::topsql_v2::upstream::consts::{ - LABEL_PLAN_DIGEST, LABEL_SQL_DIGEST, LABEL_NORMALIZED_SQL, - LABEL_DATE, LABEL_ENCODED_NORMALIZED_PLAN, LABEL_NORMALIZED_PLAN, - LABEL_SOURCE_TABLE, SOURCE_TABLE_TOPSQL_SQL_META, SOURCE_TABLE_TOPSQL_PLAN_META, + LABEL_DATE, LABEL_ENCODED_NORMALIZED_PLAN, LABEL_NORMALIZED_PLAN, LABEL_NORMALIZED_SQL, + LABEL_PLAN_DIGEST, LABEL_SOURCE_TABLE, LABEL_SQL_DIGEST, SOURCE_TABLE_TOPSQL_PLAN_META, + SOURCE_TABLE_TOPSQL_SQL_META, }; use lazy_static::lazy_static; @@ -27,7 +27,7 @@ lazy_static! 
{ "mysql_type": "text", "is_nullable": false }), - ); + ); schema_info.insert( LABEL_SQL_DIGEST.into(), serde_json::json!({ @@ -57,7 +57,7 @@ lazy_static! { "mysql_type": "text", "is_nullable": false }), - ); + ); schema_info.insert( LABEL_PLAN_DIGEST.into(), serde_json::json!({ @@ -85,13 +85,14 @@ lazy_static! { serde_json::json!(vec![LABEL_DATE.to_string()]), ); schema_info - }; + }; } /// When buffer size exceeds this value, events will be flushed const EVENT_BUFFER_MAX_SIZE: usize = 1000; /// Delta Lake sink processor +#[derive(Clone)] pub struct TopSQLDeltaLakeSink { base_path: PathBuf, tables: Vec, @@ -120,62 +121,31 @@ impl TopSQLDeltaLakeSink { storage_options: Option>, meta_cache_capacity: usize, ) -> Self { - // Create a channel with capacity 1 let (tx, rx) = mpsc::channel(1); - let tx = Arc::new(tx); - - // Create sink instance - let sink = Arc::new(Self { + let sink = Self { base_path, tables, write_config, max_delay_secs, storage_options, writers: Arc::new(Mutex::new(HashMap::new())), - tx: Arc::clone(&tx), - seen_keys_sql_meta: Arc::new(Mutex::new(LruCache::new(std::num::NonZeroUsize::new(meta_cache_capacity).unwrap()))), // LRU cache with configurable capacity - seen_keys_plan_meta: Arc::new(Mutex::new(LruCache::new(std::num::NonZeroUsize::new(meta_cache_capacity).unwrap()))), // LRU cache with configurable capacity + tx: Arc::new(tx), + seen_keys_sql_meta: Arc::new(Mutex::new(LruCache::new( + std::num::NonZeroUsize::new(meta_cache_capacity).unwrap(), + ))), // LRU cache with configurable capacity + seen_keys_plan_meta: Arc::new(Mutex::new(LruCache::new( + std::num::NonZeroUsize::new(meta_cache_capacity).unwrap(), + ))), // LRU cache with configurable capacity new_event_buffer: Arc::new(Mutex::new(Vec::new())), last_flush_time: Arc::new(Mutex::new(Instant::now())), - }); - - // Spawn process_events_loop as a separate tokio task to avoid blocking - let sink_clone = Arc::clone(&sink); + }; + let sink_clone = sink.clone(); tokio::spawn(async move { 
sink_clone.process_events_loop(rx).await; }); - - // Return the sink (Arc::try_unwrap will fail because tokio task holds a reference, - // so we use unsafe to manually get the inner value without decrementing the reference count) - // Safety: We know there's exactly one more reference (the tokio task), - // but we need to return Self, not Arc. The tokio task will continue - // to hold its reference, which is safe because TopSQLDeltaLakeSink contains - // only Arc and atomic types that are safe to share. - // We use into_raw to get a raw pointer, then manually reconstruct the value. - unsafe { - let ptr = Arc::into_raw(sink); - // Get a reference to the inner value - let inner_ref = &*ptr; - // Clone the value (TopSQLDeltaLakeSink contains only Arc and atomic types, so cloning is safe) - let inner_value = TopSQLDeltaLakeSink { - base_path: inner_ref.base_path.clone(), - tables: inner_ref.tables.clone(), - write_config: inner_ref.write_config.clone(), - max_delay_secs: inner_ref.max_delay_secs, - storage_options: inner_ref.storage_options.clone(), - writers: Arc::clone(&inner_ref.writers), - tx: Arc::clone(&inner_ref.tx), - seen_keys_sql_meta: Arc::clone(&inner_ref.seen_keys_sql_meta), - seen_keys_plan_meta: Arc::clone(&inner_ref.seen_keys_plan_meta), - new_event_buffer: Arc::clone(&inner_ref.new_event_buffer), - last_flush_time: Arc::clone(&inner_ref.last_flush_time), - }; - // Reconstruct the Arc (so the tokio task's reference remains valid) - let _ = Arc::from_raw(ptr); - inner_value - } + sink } - + #[cfg(test)] /// Create a new Delta Lake sink for testing, returning both the sink and the receiver /// The receiver can be used to verify messages sent through the channel @@ -189,9 +159,12 @@ impl TopSQLDeltaLakeSink { meta_cache_capacity: usize, ) -> (Self, mpsc::Receiver>>) { // Create a channel with capacity 1 - let (tx, rx): (mpsc::Sender>>, mpsc::Receiver>>) = mpsc::channel(1); + let (tx, rx): ( + mpsc::Sender>>, + mpsc::Receiver>>, + ) = mpsc::channel(1); let 
tx = Arc::new(tx); - + // Create sink instance (without starting process_events_loop) let sink = Self { base_path, @@ -201,21 +174,22 @@ impl TopSQLDeltaLakeSink { storage_options, writers: Arc::new(Mutex::new(HashMap::new())), tx, - seen_keys_sql_meta: Arc::new(Mutex::new(LruCache::new(std::num::NonZeroUsize::new(meta_cache_capacity).unwrap()))), // LRU cache with configurable capacity - seen_keys_plan_meta: Arc::new(Mutex::new(LruCache::new(std::num::NonZeroUsize::new(meta_cache_capacity).unwrap()))), // LRU cache with configurable capacity + seen_keys_sql_meta: Arc::new(Mutex::new(LruCache::new( + std::num::NonZeroUsize::new(meta_cache_capacity).unwrap(), + ))), // LRU cache with configurable capacity + seen_keys_plan_meta: Arc::new(Mutex::new(LruCache::new( + std::num::NonZeroUsize::new(meta_cache_capacity).unwrap(), + ))), // LRU cache with configurable capacity new_event_buffer: Arc::new(Mutex::new(Vec::new())), last_flush_time: Arc::new(Mutex::new(Instant::now())), }; - + // Return the sink and receiver for testing (sink, rx) } /// Process events from channel and write to Delta Lake - async fn process_events_loop( - &self, - mut rx: mpsc::Receiver>>, - ) { + async fn process_events_loop(&self, mut rx: mpsc::Receiver>>) { while let Some(events_vec) = rx.recv().await { if let Err(e) = self.process_events(events_vec).await { error!("Failed to process events: {}", e); @@ -227,31 +201,40 @@ impl TopSQLDeltaLakeSink { /// Returns (table_name, key) if key can be extracted, None otherwise /// table_name is the value of LABEL_SOURCE_TABLE (e.g., SOURCE_TABLE_TOPSQL_SQL_META or SOURCE_TABLE_TOPSQL_PLAN_META) /// key format: digest_date (e.g., sql_digest_2024-01-01) - fn extract_event_key(&self, log_event: &vector_lib::event::LogEvent) -> Option<(String, String)> { + fn extract_event_key( + &self, + log_event: &vector_lib::event::LogEvent, + ) -> Option<(String, String)> { // Get table_name from source_table - let table_name = log_event.get(LABEL_SOURCE_TABLE) + let 
table_name = log_event + .get(LABEL_SOURCE_TABLE) .and_then(|v| v.as_str()) .map(|s| s.to_string())?; // Get date from log_event - let date = log_event.get(LABEL_DATE) + let date = log_event + .get(LABEL_DATE) .and_then(|v| v.as_str()) .map(|s| s.to_string())?; // Extract key based on source_table type if table_name == SOURCE_TABLE_TOPSQL_SQL_META { // For SQL meta: use sql_digest_date format - if let Some(sql_digest) = log_event.get(LABEL_SQL_DIGEST) + if let Some(sql_digest) = log_event + .get(LABEL_SQL_DIGEST) .and_then(|v| v.as_str()) - .map(|s| s.to_string()) { + .map(|s| s.to_string()) + { let key = format!("{}_{}", sql_digest, date); return Some((table_name, key)); } } else if table_name == SOURCE_TABLE_TOPSQL_PLAN_META { // For PLAN meta: use plan_digest_date format - if let Some(plan_digest) = log_event.get(LABEL_PLAN_DIGEST) + if let Some(plan_digest) = log_event + .get(LABEL_PLAN_DIGEST) .and_then(|v| v.as_str()) - .map(|s| s.to_string()) { + .map(|s| s.to_string()) + { let key = format!("{}_{}", plan_digest, date); return Some((table_name, key)); } @@ -272,7 +255,8 @@ impl TopSQLDeltaLakeSink { let mut table_events: HashMap> = HashMap::new(); for event in buffer.drain(..) 
{ if let Event::Log(log_event) = event { - let table_name = log_event.get(LABEL_SOURCE_TABLE) + let table_name = log_event + .get(LABEL_SOURCE_TABLE) .and_then(|v| v.as_str()) .map(|s| s.to_string()); if let Some(table_name) = table_name { @@ -366,7 +350,7 @@ impl TopSQLDeltaLakeSink { if buffer_full || time_reached { // Release buffer lock before flushing drop(buffer); - + // Flush buffer to deltalake self.flush_buffer().await?; } @@ -381,7 +365,7 @@ impl TopSQLDeltaLakeSink { } let first_event = &mut events[0]; let log = first_event.as_mut_log(); - + // Select schema based on table_name (which is actually source_table) let schema = match table_name { SOURCE_TABLE_TOPSQL_SQL_META => &*SQL_META_SCHEMA, @@ -391,7 +375,7 @@ impl TopSQLDeltaLakeSink { return; // Return early if table_name doesn't match any known type } }; - + log.insert( "_schema_metadata", serde_json::Value::Object(schema.clone()), @@ -482,13 +466,15 @@ impl StreamSink for TopSQLDeltaLakeSink { events_cache.push(events); // Allow max delay to configured value, continue if not ready to send - if events_count + cur_cached_size < sink.write_config.batch_size - && latest_timestamp < oldest_timestamp + sink.max_delay_secs as i64 { + if events_count + cur_cached_size < sink.write_config.batch_size + && latest_timestamp < oldest_timestamp + sink.max_delay_secs as i64 + { continue; } // Send events to process_events through channel - let should_drop_on_full = latest_timestamp >= oldest_timestamp + sink.max_delay_secs as i64; + let should_drop_on_full = + latest_timestamp >= oldest_timestamp + sink.max_delay_secs as i64; match tx.try_send(events_cache) { Ok(_) => { // Successfully sent, clear the cache @@ -515,7 +501,7 @@ impl StreamSink for TopSQLDeltaLakeSink { } } } - + // When the input stream ends, try to send any remaining cached events if !events_cache.is_empty() { // Send remaining events, wait if channel is full @@ -524,7 +510,7 @@ impl StreamSink for TopSQLDeltaLakeSink { error!("Channel closed when 
flushing remaining events, dropping events"); } } - + // Note: We don't drop tx here as it's owned by the sink and may be used by other run() calls // The channel will be closed when the sink is dropped Ok(()) @@ -546,7 +532,9 @@ mod tests { event } - fn create_test_sink_with_receiver(batch_size: usize) -> (TopSQLDeltaLakeSink, mpsc::Receiver>>) { + fn create_test_sink_with_receiver( + batch_size: usize, + ) -> (TopSQLDeltaLakeSink, mpsc::Receiver>>) { TopSQLDeltaLakeSink::new_for_test( PathBuf::from("/tmp/test"), vec![], @@ -564,45 +552,47 @@ mod tests { async fn test_send_when_batch_size_reached() { let batch_size = 5; let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - + // Create events that will reach batch size let events: Vec = (0..batch_size) .map(|i| create_test_event(1000 + i as i64)) .collect(); - + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Wait a bit for the message to be sent tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Verify that a message was sent through the channel - let received = tokio::time::timeout( - tokio::time::Duration::from_millis(500), - rx.recv() - ).await; - + let received = + tokio::time::timeout(tokio::time::Duration::from_millis(500), rx.recv()).await; + assert!(received.is_ok(), "Should receive a message from channel"); if let Ok(Some(events_vec)) = received { // Verify the message content // Count total events let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - assert_eq!(total_events, batch_size, "Should receive exactly batch_size events"); - + assert_eq!( + total_events, batch_size, + "Should receive exactly batch_size events" + ); + // Verify event structure assert!(!events_vec.is_empty(), "Events vector should not 
be empty"); for event_batch in &events_vec { - assert!(!event_batch.is_empty(), "Each event batch should not be empty"); + assert!( + !event_batch.is_empty(), + "Each event batch should not be empty" + ); } } else { panic!("Failed to receive message from channel"); } - + // Wait for run to complete let _ = run_handle.await; } @@ -611,44 +601,43 @@ mod tests { async fn test_send_when_timeout_reached() { let batch_size = 100; // Large batch size so we don't reach it let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - + // Create events with timestamps that exceed timeout (180 seconds) let oldest_ts = 1000; let latest_ts = oldest_ts + 181; // Exceeds 180 second timeout - + // Create two events: one at the start, one after timeout - let events = vec![ - create_test_event(oldest_ts), - create_test_event(latest_ts), - ]; - + let events = vec![create_test_event(oldest_ts), create_test_event(latest_ts)]; + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Wait a bit for the message to be sent tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Verify that a message was sent through the channel due to timeout - let received = tokio::time::timeout( - tokio::time::Duration::from_millis(500), - rx.recv() - ).await; - - assert!(received.is_ok(), "Should receive a message from channel due to timeout"); + let received = + tokio::time::timeout(tokio::time::Duration::from_millis(500), rx.recv()).await; + + assert!( + received.is_ok(), + "Should receive a message from channel due to timeout" + ); if let Ok(Some(events_vec)) = received { // Verify the message content // Verify events were sent let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - assert_eq!(total_events, 2, "Should receive 
both events (oldest and latest)"); + assert_eq!( + total_events, 2, + "Should receive both events (oldest and latest)" + ); } else { panic!("Failed to receive message from channel"); } - + // Wait for run to complete let _ = run_handle.await; } @@ -657,55 +646,60 @@ mod tests { async fn test_channel_full_keep_cache_when_not_timeout() { let batch_size = 5; let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - + // Create many events to fill the channel (capacity 1) // The first batch will fill the channel, second batch should be kept in cache // and retried later let events: Vec = (0..batch_size * 2) .map(|i| create_test_event(1000 + i as i64)) // All within timeout window .collect(); - + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Don't consume from rx immediately to fill the channel // Wait a bit for the first message to be sent // The channel should be full now, and subsequent sends should keep data in cache // Since we're not consuming, the channel stays full // After a bit more time, the run should complete tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Now consume the first message let first_msg = rx.recv().await; assert!(first_msg.is_some(), "Should receive first message"); if let Some(events_vec) = first_msg { // Verify first message content let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - assert_eq!(total_events, batch_size, "First message should contain batch_size events"); + assert_eq!( + total_events, batch_size, + "First message should contain batch_size events" + ); } - + // Wait a bit more - the second batch should be sent after channel has space tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Check if second message 
was sent (data was kept in cache and retried) - let second_msg = tokio::time::timeout( - tokio::time::Duration::from_millis(200), - rx.recv() - ).await; - + let second_msg = + tokio::time::timeout(tokio::time::Duration::from_millis(200), rx.recv()).await; + // The second batch should eventually be sent (kept in cache and retried) - assert!(second_msg.is_ok(), "Should eventually receive second message after retry"); + assert!( + second_msg.is_ok(), + "Should eventually receive second message after retry" + ); if let Ok(Some(events_vec)) = second_msg { // Verify second message content let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - assert_eq!(total_events, batch_size, "Second message should contain batch_size events"); + assert_eq!( + total_events, batch_size, + "Second message should contain batch_size events" + ); } - + // Wait for run to complete let _ = run_handle.await; } @@ -714,7 +708,7 @@ mod tests { async fn test_channel_full_drop_when_timeout() { let batch_size = 5; let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - + // Create events with timeout: first batch, then events after timeout let mut events = vec![]; // First batch at timestamp 1000 @@ -726,54 +720,59 @@ mod tests { events.push(create_test_event(1005 + i as i64)); } events.push(create_test_event(1186)); - + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Don't consume from rx to fill the channel // Wait for first message to be sent // Channel should be full now // When the timeout event arrives and channel is full, data should be dropped tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Consume the first message let first_msg = rx.recv().await; assert!(first_msg.is_some(), "Should receive 
first message"); if let Some(events_vec) = first_msg { // Verify first message content let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - assert_eq!(total_events, batch_size, "First message should contain batch_size events"); - + assert_eq!( + total_events, batch_size, + "First message should contain batch_size events" + ); + // Verify timestamps are from the first batch (1000-1004) for event_batch in &events_vec { for event in event_batch { if let Event::Log(ref log_event) = event { - if let Some(timestamp) = log_event.get("timestamps").and_then(|v| v.as_integer()) { - assert!(timestamp >= 1000 && timestamp < 1000 + batch_size as i64, - "First message should contain events from first batch"); + if let Some(timestamp) = + log_event.get("timestamps").and_then(|v| v.as_integer()) + { + assert!( + timestamp >= 1000 && timestamp < 1000 + batch_size as i64, + "First message should contain events from first batch" + ); } } } } } - + // Wait a bit more - the timeout event should have been dropped, not sent tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Check if a second message was sent (it shouldn't be, as data was dropped) - let second_msg = tokio::time::timeout( - tokio::time::Duration::from_millis(200), - rx.recv() - ).await; + let second_msg = + tokio::time::timeout(tokio::time::Duration::from_millis(200), rx.recv()).await; // The second message should NOT be sent because data was dropped due to timeout - assert!(second_msg.is_err() || second_msg.unwrap().is_none(), - "Should NOT receive second message as data was dropped due to timeout"); - + assert!( + second_msg.is_err() || second_msg.unwrap().is_none(), + "Should NOT receive second message as data was dropped due to timeout" + ); + // Wait for run to complete let _ = run_handle.await; } @@ -782,41 +781,38 @@ mod tests { async fn test_not_send_when_batch_size_and_timeout_not_reached() { let batch_size = 10; let (sink, mut rx) = 
create_test_sink_with_receiver(batch_size); - + // Create events that don't reach batch size and don't timeout - let events: Vec = (0..3) - .map(|i| create_test_event(1000 + i)) - .collect(); - + let events: Vec = (0..3).map(|i| create_test_event(1000 + i)).collect(); + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Wait for run to complete let result = run_handle.await; assert!(result.is_ok()); assert!(result.unwrap().is_ok()); - + // Verify that no message was sent (data doesn't meet send conditions) // Note: When stream ends, remaining data might be flushed, but with only 3 events // and batch_size 10, and no timeout, it should not send immediately // However, when the stream ends, the loop exits and remaining cache might be sent // Let's check if any message was received - let received = tokio::time::timeout( - tokio::time::Duration::from_millis(200), - rx.recv() - ).await; - + let received = + tokio::time::timeout(tokio::time::Duration::from_millis(200), rx.recv()).await; + // With the current implementation, when stream ends, remaining cache might be sent // So we check if a message was received and verify its content if let Ok(Some(events_vec)) = received { // Verify the message content let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - assert_eq!(total_events, 3, "Should receive the 3 events that were cached"); + assert_eq!( + total_events, 3, + "Should receive the 3 events that were cached" + ); } else { // If no message was received, that's also valid - data wasn't sent // This depends on implementation details of when remaining cache is flushed @@ -827,41 +823,42 @@ mod tests { async fn test_batch_size_sending_behavior() { let batch_size = 3; let (sink, mut rx) = 
create_test_sink_with_receiver(batch_size); - + // Create exactly batch_size events let events: Vec = (0..batch_size) .map(|i| create_test_event(1000 + i as i64)) .collect(); - + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Wait a bit for the message to be sent tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Verify that a message was sent through the channel - let received = tokio::time::timeout( - tokio::time::Duration::from_millis(500), - rx.recv() - ).await; - + let received = + tokio::time::timeout(tokio::time::Duration::from_millis(500), rx.recv()).await; + assert!(received.is_ok(), "Should receive a message from channel"); if let Ok(Some(events_vec)) = received { // Verify the message content // Count total events let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); - assert_eq!(total_events, batch_size, "Should receive exactly batch_size events"); - + assert_eq!( + total_events, batch_size, + "Should receive exactly batch_size events" + ); + // Verify event timestamps for event_batch in events_vec { for (i, event) in event_batch.iter().enumerate() { if let Event::Log(ref log_event) = event { - if let Some(timestamp) = log_event.get("timestamps").and_then(|v| v.as_integer()) { + if let Some(timestamp) = + log_event.get("timestamps").and_then(|v| v.as_integer()) + { assert_eq!(timestamp, 1000 + i as i64, "Event timestamp should match"); } } @@ -870,7 +867,7 @@ mod tests { } else { panic!("Failed to receive message from channel"); } - + // Wait for run to complete let _ = run_handle.await; } @@ -879,56 +876,58 @@ mod tests { async fn test_timeout_sending_behavior() { let batch_size = 100; // Large batch size let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - 
+ // Create events with large time gap (exceeding 180 seconds) let oldest_ts = 1000; let latest_ts = 1181; // 181 seconds later, exceeds timeout - let events = vec![ - create_test_event(oldest_ts), - create_test_event(latest_ts), - ]; - + let events = vec![create_test_event(oldest_ts), create_test_event(latest_ts)]; + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Wait a bit for the message to be sent tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; - + // Verify that a message was sent through the channel due to timeout - let received = tokio::time::timeout( - tokio::time::Duration::from_millis(500), - rx.recv() - ).await; - - assert!(received.is_ok(), "Should receive a message from channel due to timeout"); + let received = + tokio::time::timeout(tokio::time::Duration::from_millis(500), rx.recv()).await; + + assert!( + received.is_ok(), + "Should receive a message from channel due to timeout" + ); if let Ok(Some(events_vec)) = received { // Verify the message content // Count total events let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); assert_eq!(total_events, 2, "Should receive both events"); - + // Verify event timestamps let mut timestamps = Vec::new(); for event_batch in &events_vec { for event in event_batch { if let Event::Log(ref log_event) = event { - if let Some(timestamp) = log_event.get("timestamps").and_then(|v| v.as_integer()) { + if let Some(timestamp) = + log_event.get("timestamps").and_then(|v| v.as_integer()) + { timestamps.push(timestamp); } } } } timestamps.sort(); - assert_eq!(timestamps, vec![oldest_ts, latest_ts], "Should receive events with correct timestamps"); + assert_eq!( + timestamps, + vec![oldest_ts, latest_ts], + "Should receive events with correct 
timestamps" + ); } else { panic!("Failed to receive message from channel"); } - + // Wait for run to complete let _ = run_handle.await; } @@ -937,51 +936,51 @@ mod tests { async fn test_multiple_batches() { let batch_size = 3; let (sink, mut rx) = create_test_sink_with_receiver(batch_size); - + // Create multiple batches worth of events let total_events = batch_size * 3; let events: Vec = (0..total_events) .map(|i| create_test_event(1000 + i as i64)) .collect(); - + let input_stream = stream::iter(events.clone()).boxed(); let sink_box = Box::new(sink); - + // Run the function in a task - let run_handle = tokio::spawn(async move { - sink_box.run(input_stream).await - }); - + let run_handle = tokio::spawn(async move { sink_box.run(input_stream).await }); + // Collect all messages from the channel let mut received_messages = Vec::new(); let expected_batches = (total_events + batch_size - 1) / batch_size; // Ceiling division - + // Wait for all batches to be sent for _ in 0..expected_batches { - let received = tokio::time::timeout( - tokio::time::Duration::from_millis(500), - rx.recv() - ).await; + let received = + tokio::time::timeout(tokio::time::Duration::from_millis(500), rx.recv()).await; if let Ok(Some(msg)) = received { received_messages.push(msg); } else { break; } } - + // Verify we received the expected number of batches assert!(received_messages.len() >= 1); // Verify total events received - let total_received: usize = received_messages.iter() + let total_received: usize = received_messages + .iter() .map(|events_vec| events_vec.iter().map(|v| v.len()).sum::()) .sum(); - assert_eq!(total_received, total_events, "Should receive all events across batches"); - + assert_eq!( + total_received, total_events, + "Should receive all events across batches" + ); + // Verify each message for events_vec in &received_messages { assert!(!events_vec.is_empty(), "Each batch should contain events"); } - + // Wait for run to complete let _ = run_handle.await; } diff --git 
a/src/sources/topsql_v2/mod.rs b/src/sources/topsql_v2/mod.rs index 6508829..10b8c69 100644 --- a/src/sources/topsql_v2/mod.rs +++ b/src/sources/topsql_v2/mod.rs @@ -68,10 +68,10 @@ pub struct TopSQLConfig { /// PLACEHOLDER pub pd_address: Option, - /// PLACEHOLDER + /// Optional manager endpoint used to fetch active TiDB instances instead of reading TiDB topology from etcd. pub manager_server_address: Option, - /// PLACEHOLDER + /// Comma-separated namespace list sent to `/api/tidb/get_active_tidb?namespace=...`; required when `manager_server_address` is set. pub tidb_namespace: Option, /// PLACEHOLDER From 17a2eb7f62c1a38f81d2f9d5892e1bfe129e67a7 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Tue, 24 Mar 2026 17:49:12 +0800 Subject: [PATCH 16/26] topru: finish remaining review fixes --- src/common/keyspace_cluster.rs | 35 +++++++++++++++++- src/common/topology/fetch/mod.rs | 27 ++++++++------ src/common/topology/fetch/tidb_manager.rs | 15 ++------ src/sinks/topsql_data_deltalake/processor.rs | 39 +++++++++++++++++++- 4 files changed, 88 insertions(+), 28 deletions(-) diff --git a/src/common/keyspace_cluster.rs b/src/common/keyspace_cluster.rs index 83befa2..62a424d 100644 --- a/src/common/keyspace_cluster.rs +++ b/src/common/keyspace_cluster.rs @@ -7,6 +7,7 @@ use std::time::Duration; use lru::LruCache; use reqwest::{Certificate, Client, Identity, StatusCode}; use serde::Deserialize; +use serde_json::Value; use tokio::sync::Mutex; use url::form_urlencoded::byte_serialize; use vector_lib::tls::TlsConfig; @@ -174,8 +175,30 @@ fn normalize_pd_address(pd_address: &str, use_tls: bool) -> String { } fn is_not_found_body(body: &str) -> bool { - let lower = body.to_ascii_lowercase(); - lower.contains("not found") + if body.to_ascii_lowercase().contains("keyspace not found") { + return true; + } + + let Ok(value) = serde_json::from_str::(body) else { + return false; + }; + + extract_error_message(&value) + .map(|message| message.to_ascii_lowercase().contains("keyspace not 
found")) + .unwrap_or(false) +} + +fn extract_error_message(value: &Value) -> Option<&str> { + value + .get("message") + .and_then(Value::as_str) + .or_else(|| value.get("error").and_then(Value::as_str)) + .or_else(|| { + value + .get("error") + .and_then(|error| error.get("message")) + .and_then(Value::as_str) + }) } fn extract_route_from_config(config: &HashMap) -> Option { @@ -251,6 +274,14 @@ mod tests { assert_eq!(extract_route_from_config(&legacy_config), None); } + #[test] + fn is_not_found_body_only_matches_keyspace_errors() { + assert!(is_not_found_body("keyspace not found")); + assert!(is_not_found_body(r#"{"message":"keyspace not found"}"#)); + assert!(!is_not_found_body("certificate not found")); + assert!(!is_not_found_body(r#"{"message":"PD server not found"}"#)); + } + #[tokio::test] async fn resolve_keyspace_uses_pd_keyspace_api_and_caches_result() { let request_count = Arc::new(AtomicUsize::new(0)); diff --git a/src/common/topology/fetch/mod.rs b/src/common/topology/fetch/mod.rs index 6d442d8..bd807d6 100644 --- a/src/common/topology/fetch/mod.rs +++ b/src/common/topology/fetch/mod.rs @@ -22,6 +22,20 @@ use vector::config::ProxyConfig; use vector::http::HttpClient; use vector::tls::{MaybeTlsSettings, TlsConfig}; +pub(super) fn normalize_namespace_list(namespaces: Option<&str>) -> Option { + let namespaces = namespaces?; + let normalized = namespaces + .split(',') + .map(str::trim) + .filter(|namespace| !namespace.is_empty()) + .collect::>(); + if normalized.is_empty() { + None + } else { + Some(normalized.join(",")) + } +} + #[derive(Debug, Snafu)] pub enum FetchError { #[snafu(display("Failed to build TLS settings: {}", source))] @@ -155,18 +169,7 @@ impl LegacyTopologyFetcher { manager_server_address: Option<&str>, tidb_namespace: Option, ) -> Result, FetchError> { - let tidb_namespace = tidb_namespace.and_then(|namespaces| { - let normalized = namespaces - .split(',') - .map(str::trim) - .filter(|namespace| !namespace.is_empty()) - 
.collect::>(); - if normalized.is_empty() { - None - } else { - Some(normalized.join(",")) - } - }); + let tidb_namespace = normalize_namespace_list(tidb_namespace.as_deref()); if manager_server_address.is_some() && tidb_namespace.is_none() { return Err(FetchError::ConfigurationError { diff --git a/src/common/topology/fetch/tidb_manager.rs b/src/common/topology/fetch/tidb_manager.rs index add5828..60e2817 100644 --- a/src/common/topology/fetch/tidb_manager.rs +++ b/src/common/topology/fetch/tidb_manager.rs @@ -5,6 +5,7 @@ use serde_json::{Map, Value}; use snafu::{ResultExt, Snafu}; use vector::http::HttpClient; +use super::normalize_namespace_list; use crate::common::topology::fetch::utils; use crate::common::topology::{Component, InstanceType}; @@ -138,7 +139,7 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { } fn active_tidb_endpoint_url(&self) -> Option { - let namespaces = Self::normalize_namespaces(self.tidb_namespace)?; + let namespaces = normalize_namespace_list(self.tidb_namespace)?; Some(Self::build_active_tidb_endpoint_url( self.manager_server_address, &namespaces, @@ -146,17 +147,7 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { } fn normalize_namespaces(namespaces: Option<&str>) -> Option { - let namespaces = namespaces?; - let normalized = namespaces - .split(',') - .map(str::trim) - .filter(|ns| !ns.is_empty()) - .collect::>(); - if normalized.is_empty() { - None - } else { - Some(normalized.join(",")) - } + normalize_namespace_list(namespaces) } fn build_active_tidb_endpoint_url(manager_server_address: &str, namespaces: &str) -> String { diff --git a/src/sinks/topsql_data_deltalake/processor.rs b/src/sinks/topsql_data_deltalake/processor.rs index 863fa9d..1ea2d9a 100644 --- a/src/sinks/topsql_data_deltalake/processor.rs +++ b/src/sinks/topsql_data_deltalake/processor.rs @@ -249,6 +249,8 @@ lazy_static! 
{ } const ROUTE_RESOLUTION_RETRY_DELAY: Duration = Duration::from_secs(5); +const MAX_ROUTE_RESOLUTION_RETRIES: usize = 5; +const MAX_ROUTE_RESOLUTION_RETRY_DELAY: Duration = Duration::from_secs(60); /// Delta Lake sink processor #[derive(Clone)] @@ -337,17 +339,33 @@ impl TopSQLDeltaLakeSink { while let Some(events_vec) = rx.recv().await { let retry_on_failure = self.keyspace_route_resolver.is_some(); let mut pending_events = events_vec; + let mut retry_count = 0usize; loop { - let retry_snapshot = retry_on_failure.then(|| pending_events.clone()); + let can_retry = retry_on_failure && retry_count < MAX_ROUTE_RESOLUTION_RETRIES; + let retry_snapshot = can_retry.then(|| pending_events.clone()); match self.process_events(pending_events).await { Ok(()) => break, Err(error) => { error!("Failed to process events: {}", error); let Some(events) = retry_snapshot else { + if retry_on_failure { + error!( + "Dropping event batch after {} route-resolution retries", + retry_count + ); + } break; }; - tokio::time::sleep(ROUTE_RESOLUTION_RETRY_DELAY).await; + retry_count += 1; + let retry_delay = route_resolution_retry_delay(retry_count); + warn!( + "Retrying event batch after route-resolution failure (attempt {}/{}, delay {:?})", + retry_count, + MAX_ROUTE_RESOLUTION_RETRIES, + retry_delay + ); + tokio::time::sleep(retry_delay).await; pending_events = events; } } @@ -579,6 +597,15 @@ impl TopSQLDeltaLakeSink { } } +fn route_resolution_retry_delay(retry_count: usize) -> Duration { + let multiplier = 1u64 << retry_count.saturating_sub(1).min(6); + let delay_secs = ROUTE_RESOLUTION_RETRY_DELAY + .as_secs() + .saturating_mul(multiplier) + .min(MAX_ROUTE_RESOLUTION_RETRY_DELAY.as_secs()); + Duration::from_secs(delay_secs) +} + #[async_trait::async_trait] impl StreamSink for TopSQLDeltaLakeSink { async fn run(self: Box, input: BoxStream<'_, Event>) -> Result<(), ()> { @@ -698,6 +725,14 @@ mod tests { ) } + #[test] + fn test_route_resolution_retry_delay_caps_at_maximum() { + 
assert_eq!(route_resolution_retry_delay(1), Duration::from_secs(5)); + assert_eq!(route_resolution_retry_delay(2), Duration::from_secs(10)); + assert_eq!(route_resolution_retry_delay(5), Duration::from_secs(60)); + assert_eq!(route_resolution_retry_delay(8), Duration::from_secs(60)); + } + #[test] fn test_build_table_path_with_meta_route_for_s3() { let (sink, _) = TopSQLDeltaLakeSink::new_for_test( From c2126654ea263987235a2c69db7b00d9d1f67359 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Wed, 25 Mar 2026 13:25:19 +0800 Subject: [PATCH 17/26] Add keyspace routing for TopSQL meta events --- src/sinks/topsql_meta_deltalake/arch.md | 4 +- src/sinks/topsql_meta_deltalake/mod.rs | 41 +- src/sinks/topsql_meta_deltalake/processor.rs | 328 +++++++++++-- src/sources/topsql_v2/arch.md | 10 + src/sources/topsql_v2/upstream/tidb/parser.rs | 453 +++++++++++++----- 5 files changed, 662 insertions(+), 174 deletions(-) diff --git a/src/sinks/topsql_meta_deltalake/arch.md b/src/sinks/topsql_meta_deltalake/arch.md index 3983c09..f1e5d66 100644 --- a/src/sinks/topsql_meta_deltalake/arch.md +++ b/src/sinks/topsql_meta_deltalake/arch.md @@ -49,13 +49,15 @@ pub struct TopSQLMetaDeltaLakeConfig { 2. **Metadata Transformation**: Transform metadata format 3. **Schema Management**: Handle metadata schema 4. **Delta Lake Writing**: Write using deltalake_writer -5. **Partitioning**: Partition by metadata type +5. **Keyspace Routing**: Optionally resolve `keyspace -> org/cluster` via PD before choosing the output path +6. 
**Partitioning**: Partition by metadata type ## TopSQL Metadata Features - **Schema Storage**: Store SQL schemas - **Query Plan Storage**: Store query execution plans - **Metadata Versioning**: Track metadata changes over time +- **Keyspace-aware Layout**: When `enable_keyspace_cluster_mapping = true`, metadata is written under `org=/cluster=/component=topsql_{sql_meta|plan_meta}` ## Dependencies diff --git a/src/sinks/topsql_meta_deltalake/mod.rs b/src/sinks/topsql_meta_deltalake/mod.rs index 6b332a3..eb7d2e0 100644 --- a/src/sinks/topsql_meta_deltalake/mod.rs +++ b/src/sinks/topsql_meta_deltalake/mod.rs @@ -25,8 +25,13 @@ use tracing::{error, info, warn}; mod processor; // Import default functions from common module -use crate::common::deltalake_writer::{default_batch_size, default_timeout_secs}; use crate::common::deltalake_s3; +use crate::common::deltalake_writer::{default_batch_size, default_timeout_secs}; +use crate::common::keyspace_cluster::PdKeyspaceResolver; + +pub const fn default_enable_keyspace_cluster_mapping() -> bool { + false +} pub const fn default_max_delay_secs() -> u64 { 180 @@ -63,6 +68,16 @@ pub struct DeltaLakeConfig { #[serde(default = "default_meta_cache_capacity")] pub meta_cache_capacity: usize, + /// Whether to resolve keyspace to org/cluster path segments through PD. + #[serde(default = "default_enable_keyspace_cluster_mapping")] + pub enable_keyspace_cluster_mapping: bool, + + /// PD address used to resolve keyspace to org/cluster path segments. + pub pd_address: Option, + + /// TLS configuration for PD keyspace lookup. 
+ pub pd_tls: Option, + /// Storage options for cloud storage pub storage_options: Option>, @@ -109,6 +124,9 @@ impl GenerateConfig for DeltaLakeConfig { timeout_secs: default_timeout_secs(), max_delay_secs: default_max_delay_secs(), meta_cache_capacity: default_meta_cache_capacity(), + enable_keyspace_cluster_mapping: default_enable_keyspace_cluster_mapping(), + pd_address: None, + pd_tls: None, storage_options: None, bucket: None, options: None, @@ -228,6 +246,24 @@ impl DeltaLakeConfig { info!("No S3 service available - using default storage options only"); } + let keyspace_route_resolver = if self.enable_keyspace_cluster_mapping { + let pd_address = self.pd_address.as_deref().ok_or_else(|| { + vector::Error::from( + "pd_address is required when enable_keyspace_cluster_mapping is true", + ) + })?; + Some( + PdKeyspaceResolver::new(pd_address, self.pd_tls.clone()).map_err(|error| { + vector::Error::from(format!( + "failed to build PD keyspace resolver from pd_address: {}", + error + )) + })?, + ) + } else { + None + }; + let sink = TopSQLDeltaLakeSink::new( base_path, table_configs, @@ -235,6 +271,7 @@ impl DeltaLakeConfig { self.max_delay_secs, Some(storage_options), self.meta_cache_capacity, + keyspace_route_resolver, ); Ok(VectorSink::from_event_streamsink(sink)) @@ -282,4 +319,4 @@ mod tests { fn generate_config() { vector::test_util::test_generate_config::(); } -} \ No newline at end of file +} diff --git a/src/sinks/topsql_meta_deltalake/processor.rs b/src/sinks/topsql_meta_deltalake/processor.rs index 9aa9e8a..e1d5d9d 100644 --- a/src/sinks/topsql_meta_deltalake/processor.rs +++ b/src/sinks/topsql_meta_deltalake/processor.rs @@ -7,14 +7,15 @@ use futures::{stream::BoxStream, StreamExt}; use lru::LruCache; use tokio::sync::mpsc; use tokio::sync::Mutex; -use vector_lib::event::Event; +use vector_lib::event::{Event, LogEvent}; use vector_lib::sink::StreamSink; use crate::common::deltalake_writer::{DeltaLakeWriter, DeltaTableConfig, WriteConfig}; +use 
crate::common::keyspace_cluster::{KeyspaceRoute, PdKeyspaceResolver}; use crate::sources::topsql_v2::upstream::consts::{ - LABEL_DATE, LABEL_ENCODED_NORMALIZED_PLAN, LABEL_NORMALIZED_PLAN, LABEL_NORMALIZED_SQL, - LABEL_PLAN_DIGEST, LABEL_SOURCE_TABLE, LABEL_SQL_DIGEST, SOURCE_TABLE_TOPSQL_PLAN_META, - SOURCE_TABLE_TOPSQL_SQL_META, + LABEL_DATE, LABEL_ENCODED_NORMALIZED_PLAN, LABEL_KEYSPACE, LABEL_NORMALIZED_PLAN, + LABEL_NORMALIZED_SQL, LABEL_PLAN_DIGEST, LABEL_SOURCE_TABLE, LABEL_SQL_DIGEST, + SOURCE_TABLE_TOPSQL_PLAN_META, SOURCE_TABLE_TOPSQL_SQL_META, }; use lazy_static::lazy_static; @@ -35,6 +36,13 @@ lazy_static! { "is_nullable": true }), ); + schema_info.insert( + LABEL_KEYSPACE.into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": true + }), + ); schema_info.insert( LABEL_NORMALIZED_SQL.into(), serde_json::json!({ @@ -65,6 +73,13 @@ lazy_static! { "is_nullable": true }), ); + schema_info.insert( + LABEL_KEYSPACE.into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": true + }), + ); schema_info.insert( LABEL_NORMALIZED_PLAN.into(), serde_json::json!({ @@ -90,6 +105,9 @@ lazy_static! 
{ /// When buffer size exceeds this value, events will be flushed const EVENT_BUFFER_MAX_SIZE: usize = 1000; +const ROUTE_RESOLUTION_RETRY_DELAY: Duration = Duration::from_secs(5); +const MAX_ROUTE_RESOLUTION_RETRIES: usize = 5; +const MAX_ROUTE_RESOLUTION_RETRY_DELAY: Duration = Duration::from_secs(60); /// Delta Lake sink processor #[derive(Clone)] @@ -99,7 +117,8 @@ pub struct TopSQLDeltaLakeSink { write_config: WriteConfig, max_delay_secs: u64, storage_options: Option>, - writers: Arc>>, + keyspace_route_resolver: Option, + writers: Arc>>, tx: Arc>>>, // LRU cache for SQL meta deduplication: key -> () seen_keys_sql_meta: Arc>>, @@ -111,6 +130,12 @@ pub struct TopSQLDeltaLakeSink { last_flush_time: Arc>, } +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +struct WriterKey { + table_name: String, + table_path: PathBuf, +} + impl TopSQLDeltaLakeSink { /// Create a new Delta Lake sink pub fn new( @@ -120,6 +145,7 @@ impl TopSQLDeltaLakeSink { max_delay_secs: u64, storage_options: Option>, meta_cache_capacity: usize, + keyspace_route_resolver: Option, ) -> Self { let (tx, rx) = mpsc::channel(1); let sink = Self { @@ -128,6 +154,7 @@ impl TopSQLDeltaLakeSink { write_config, max_delay_secs, storage_options, + keyspace_route_resolver, writers: Arc::new(Mutex::new(HashMap::new())), tx: Arc::new(tx), seen_keys_sql_meta: Arc::new(Mutex::new(LruCache::new( @@ -157,6 +184,7 @@ impl TopSQLDeltaLakeSink { max_delay_secs: u64, storage_options: Option>, meta_cache_capacity: usize, + keyspace_route_resolver: Option, ) -> (Self, mpsc::Receiver>>) { // Create a channel with capacity 1 let (tx, rx): ( @@ -172,6 +200,7 @@ impl TopSQLDeltaLakeSink { write_config, max_delay_secs, storage_options, + keyspace_route_resolver, writers: Arc::new(Mutex::new(HashMap::new())), tx, seen_keys_sql_meta: Arc::new(Mutex::new(LruCache::new( @@ -191,8 +220,38 @@ impl TopSQLDeltaLakeSink { /// Process events from channel and write to Delta Lake async fn process_events_loop(&self, mut rx: 
mpsc::Receiver>>) { while let Some(events_vec) = rx.recv().await { - if let Err(e) = self.process_events(events_vec).await { - error!("Failed to process events: {}", e); + let retry_on_failure = self.keyspace_route_resolver.is_some(); + let mut pending_events = events_vec; + let mut retry_count = 0usize; + + loop { + let can_retry = retry_on_failure && retry_count < MAX_ROUTE_RESOLUTION_RETRIES; + let retry_snapshot = can_retry.then(|| pending_events.clone()); + match self.process_events(pending_events).await { + Ok(()) => break, + Err(error) => { + error!("Failed to process events: {}", error); + let Some(events) = retry_snapshot else { + if retry_on_failure { + error!( + "Dropping meta event batch after {} route-resolution retries", + retry_count + ); + } + break; + }; + retry_count += 1; + let retry_delay = route_resolution_retry_delay(retry_count); + warn!( + "Retrying meta event batch after route-resolution failure (attempt {}/{}, delay {:?})", + retry_count, + MAX_ROUTE_RESOLUTION_RETRIES, + retry_delay + ); + tokio::time::sleep(retry_delay).await; + pending_events = events; + } + } } } } @@ -217,6 +276,11 @@ impl TopSQLDeltaLakeSink { .and_then(|v| v.as_str()) .map(|s| s.to_string())?; + let keyspace = log_event + .get(LABEL_KEYSPACE) + .and_then(|v| v.as_str()) + .map(|s| s.to_string()); + // Extract key based on source_table type if table_name == SOURCE_TABLE_TOPSQL_SQL_META { // For SQL meta: use sql_digest_date format @@ -225,7 +289,10 @@ impl TopSQLDeltaLakeSink { .and_then(|v| v.as_str()) .map(|s| s.to_string()) { - let key = format!("{}_{}", sql_digest, date); + let key = match keyspace.as_deref() { + Some(keyspace) => format!("{}_{}_{}", keyspace, sql_digest, date), + None => format!("{}_{}", sql_digest, date), + }; return Some((table_name, key)); } } else if table_name == SOURCE_TABLE_TOPSQL_PLAN_META { @@ -235,7 +302,10 @@ impl TopSQLDeltaLakeSink { .and_then(|v| v.as_str()) .map(|s| s.to_string()) { - let key = format!("{}_{}", plan_digest, date); 
+ let key = match keyspace.as_deref() { + Some(keyspace) => format!("{}_{}_{}", keyspace, plan_digest, date), + None => format!("{}_{}", plan_digest, date), + }; return Some((table_name, key)); } } @@ -251,17 +321,17 @@ impl TopSQLDeltaLakeSink { return Ok(()); } - // Group events by table_name - let mut table_events: HashMap> = HashMap::new(); + // Group events by writer target so each org/cluster route gets its own table. + let mut table_events: HashMap> = HashMap::new(); + let mut resolved_routes: HashMap> = HashMap::new(); for event in buffer.drain(..) { if let Event::Log(log_event) = event { - let table_name = log_event - .get(LABEL_SOURCE_TABLE) - .and_then(|v| v.as_str()) - .map(|s| s.to_string()); - if let Some(table_name) = table_name { + if let Some(writer_key) = self + .resolve_writer_key(&log_event, &mut resolved_routes) + .await? + { table_events - .entry(table_name) + .entry(writer_key) .or_insert_with(Vec::new) .push(Event::Log(log_event)); } @@ -269,9 +339,9 @@ impl TopSQLDeltaLakeSink { } // Write table's events - for (table_name, mut events) in table_events { - self.add_schema_info(&table_name, &mut events); - if let Err(e) = self.write_table_events(&table_name, events).await { + for (writer_key, mut events) in table_events { + self.add_schema_info(&writer_key.table_name, &mut events); + if let Err(e) = self.write_table_events(&writer_key, events).await { let error_msg = e.to_string(); if error_msg.contains("log segment") || error_msg.contains("Invalid table version") @@ -280,10 +350,15 @@ impl TopSQLDeltaLakeSink { { panic!( "Delta Lake corruption detected for table {}: {}", - table_name, error_msg + writer_key.table_name, error_msg ); } else { - error!("Failed to write events to table {}: {}", table_name, e); + error!( + "Failed to write events to table {} at {}: {}", + writer_key.table_name, + writer_key.table_path.display(), + e + ); } } } @@ -358,6 +433,92 @@ impl TopSQLDeltaLakeSink { Ok(()) } + async fn resolve_writer_key( + &self, + 
log_event: &LogEvent, + resolved_routes: &mut HashMap>, + ) -> Result, Box> { + let Some(table_name) = log_event + .get(LABEL_SOURCE_TABLE) + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + else { + return Ok(None); + }; + let route = self + .resolve_keyspace_route(log_event, resolved_routes) + .await?; + if self.keyspace_route_resolver.is_some() && route.is_none() { + return Ok(None); + } + Ok(Some(WriterKey { + table_name: table_name.clone(), + table_path: self.build_table_path(&table_name, route.as_ref()), + })) + } + + async fn resolve_keyspace_route( + &self, + log_event: &LogEvent, + resolved_routes: &mut HashMap>, + ) -> Result, Box> { + let Some(resolver) = self.keyspace_route_resolver.as_ref() else { + return Ok(None); + }; + let Some(keyspace) = log_event + .get(LABEL_KEYSPACE) + .and_then(|value| value.as_str()) + else { + return Ok(None); + }; + + if let Some(route) = resolved_routes.get(keyspace.as_ref()) { + return Ok(route.clone()); + } + + let route = resolver.resolve_keyspace(keyspace.as_ref()).await?; + resolved_routes.insert(keyspace.to_string(), route.clone()); + if route.is_none() { + warn!( + "No cluster route found for keyspace {}, skipping TopSQL meta event", + keyspace + ); + } + Ok(route) + } + + fn build_table_path(&self, table_name: &str, route: Option<&KeyspaceRoute>) -> PathBuf { + let mut segments = Vec::new(); + if let Some(route) = route { + segments.push(format!("org={}", route.org_id)); + segments.push(format!("cluster={}", route.cluster_id)); + } + segments.push(format!("component={}", table_name)); + + let segment_refs: Vec<&str> = segments.iter().map(|segment| segment.as_str()).collect(); + Self::join_path(&self.base_path, &segment_refs) + } + + fn join_path(base_path: &PathBuf, segments: &[&str]) -> PathBuf { + if base_path.to_string_lossy().starts_with("s3://") { + let mut path = base_path + .to_string_lossy() + .trim_end_matches('/') + .to_string(); + for segment in segments { + path.push('/'); + 
path.push_str(segment); + } + PathBuf::from(path) + } else { + let mut path = base_path.clone(); + for segment in segments { + path = path.join(segment); + } + path + } + } + /// Write events to a specific table fn add_schema_info(&self, table_name: &str, events: &mut Vec) { if events.is_empty() { @@ -385,35 +546,23 @@ impl TopSQLDeltaLakeSink { /// Write events to a specific table async fn write_table_events( &self, - table_name: &str, + writer_key: &WriterKey, events: Vec, ) -> Result<(), Box> { // Get or create writer for this table let mut writers = self.writers.lock().await; - let writer = writers.entry(table_name.to_string()).or_insert_with(|| { - let table_path = if self.base_path.to_string_lossy().starts_with("s3://") { - // For S3 paths, append the table name to the S3 path - PathBuf::from(format!( - "{}/component={}", - self.base_path.to_string_lossy(), - table_name - )) - } else { - // For local paths, use join as before - self.base_path.join(format!("component={}", table_name)) - }; - + let writer = writers.entry(writer_key.clone()).or_insert_with(|| { let table_config = self .tables .iter() - .find(|t| t.name == table_name) + .find(|t| t.name == writer_key.table_name) .cloned() .unwrap_or_else(|| DeltaTableConfig { - name: table_name.to_string(), + name: writer_key.table_name.clone(), schema_evolution: Some(true), }); DeltaLakeWriter::new_with_options( - table_path, + writer_key.table_path.clone(), table_config, self.write_config.clone(), self.storage_options.clone(), @@ -428,6 +577,15 @@ impl TopSQLDeltaLakeSink { } } +fn route_resolution_retry_delay(retry_count: usize) -> Duration { + let multiplier = 1u64 << retry_count.saturating_sub(1).min(6); + let delay_secs = ROUTE_RESOLUTION_RETRY_DELAY + .as_secs() + .saturating_mul(multiplier) + .min(MAX_ROUTE_RESOLUTION_RETRY_DELAY.as_secs()); + Duration::from_secs(delay_secs) +} + #[async_trait::async_trait] impl StreamSink for TopSQLDeltaLakeSink { async fn run(self: Box, input: BoxStream<'_, Event>) -> 
Result<(), ()> { @@ -545,9 +703,101 @@ mod tests { 180, // Use default value for tests None, 10000, // Use default LRU cache capacity for tests + None, ) } + fn create_meta_event( + source_table: &str, + digest_field: &str, + digest: &str, + keyspace: Option<&str>, + ) -> LogEvent { + let mut log = LogEvent::default(); + log.insert(LABEL_SOURCE_TABLE, source_table); + log.insert(LABEL_DATE, "2026-03-25"); + log.insert(digest_field, digest); + if let Some(keyspace) = keyspace { + log.insert(LABEL_KEYSPACE, keyspace); + } + log + } + + #[test] + fn test_extract_event_key_includes_keyspace() { + let (sink, _) = create_test_sink_with_receiver(1); + let sql_meta_event = create_meta_event( + SOURCE_TABLE_TOPSQL_SQL_META, + LABEL_SQL_DIGEST, + "SQL_DIGEST", + Some("ks-a"), + ); + let plan_meta_event = create_meta_event( + SOURCE_TABLE_TOPSQL_PLAN_META, + LABEL_PLAN_DIGEST, + "PLAN_DIGEST", + Some("ks-b"), + ); + + assert_eq!( + sink.extract_event_key(&sql_meta_event), + Some(( + SOURCE_TABLE_TOPSQL_SQL_META.to_string(), + "ks-a_SQL_DIGEST_2026-03-25".to_string(), + )) + ); + assert_eq!( + sink.extract_event_key(&plan_meta_event), + Some(( + SOURCE_TABLE_TOPSQL_PLAN_META.to_string(), + "ks-b_PLAN_DIGEST_2026-03-25".to_string(), + )) + ); + } + + #[test] + fn test_build_table_path_with_keyspace_route_for_s3() { + let (sink, _) = TopSQLDeltaLakeSink::new_for_test( + PathBuf::from("s3://o11y-prod-shared-us-west-2-premium/deltalake"), + vec![], + WriteConfig { + batch_size: 1, + timeout_secs: 0, + }, + 180, + None, + 10000, + None, + ); + + let table_path = sink.build_table_path( + SOURCE_TABLE_TOPSQL_SQL_META, + Some(&KeyspaceRoute { + org_id: "30018".to_string(), + cluster_id: "10155668891296301432".to_string(), + }), + ); + + assert_eq!( + table_path, + PathBuf::from( + "s3://o11y-prod-shared-us-west-2-premium/deltalake/org=30018/cluster=10155668891296301432/component=topsql_sql_meta" + ) + ); + } + + #[test] + fn 
test_build_table_path_without_keyspace_route_preserves_existing_layout() { + let (sink, _) = create_test_sink_with_receiver(1); + + let table_path = sink.build_table_path(SOURCE_TABLE_TOPSQL_PLAN_META, None); + + assert_eq!( + table_path, + PathBuf::from("/tmp/test/component=topsql_plan_meta") + ); + } + #[tokio::test] async fn test_send_when_batch_size_reached() { let batch_size = 5; diff --git a/src/sources/topsql_v2/arch.md b/src/sources/topsql_v2/arch.md index 86c0182..c59f101 100644 --- a/src/sources/topsql_v2/arch.md +++ b/src/sources/topsql_v2/arch.md @@ -52,11 +52,21 @@ Legacy mode discovery options: - `manager_server_address`: optional manager endpoint used to fetch active TiDB instances - `tidb_namespace`: manager namespace list used when calling `/api/tidb/get_active_tidb` - `enable_tikv_topsql`: whether to collect `tikv_topsql` and `tikv_topregion`; defaults to `true` +- `topru`: TiDB TopRU subscription options (`enable`, `report_interval_seconds`, `item_interval_seconds`) ## Data Flow Same as TopSQL v1 but with improved reliability and performance. +TiDB subscriptions emit four log-event families: + +- `tidb_topsql`: execution metrics keyed by SQL/plan digest +- `topsql_topru`: RU metrics keyed by SQL/plan digest and user +- `topsql_sql_meta`: normalized SQL text +- `topsql_plan_meta`: normalized / encoded plan text + +For TiDB-originated events, `keyspace` is propagated when the upstream payload includes `keyspace_name`, including `topsql_sql_meta` and `topsql_plan_meta`. 
+ ## Dependencies - Same as TopSQL v1 diff --git a/src/sources/topsql_v2/upstream/tidb/parser.rs b/src/sources/topsql_v2/upstream/tidb/parser.rs index 50d1aba..84f2c9d 100644 --- a/src/sources/topsql_v2/upstream/tidb/parser.rs +++ b/src/sources/topsql_v2/upstream/tidb/parser.rs @@ -1,24 +1,25 @@ use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; -use chrono::Utc; -use vector::event::Event; -use vector_lib::event::{LogEvent, Value as LogValue}; use crate::sources::topsql_v2::schema_cache::SchemaCache; use crate::sources::topsql_v2::upstream::consts::{ - LABEL_DATE, LABEL_ENCODED_NORMALIZED_PLAN, LABEL_INSTANCE_KEY, - LABEL_NORMALIZED_PLAN, LABEL_NORMALIZED_SQL, LABEL_PLAN_DIGEST, - LABEL_SQL_DIGEST, LABEL_SOURCE_TABLE, LABEL_TIMESTAMPS, LABEL_KEYSPACE, LABEL_USER, - METRIC_NAME_CPU_TIME_MS, METRIC_NAME_NETWORK_IN_BYTES, METRIC_NAME_NETWORK_OUT_BYTES, - METRIC_NAME_STMT_DURATION_COUNT, METRIC_NAME_STMT_DURATION_SUM_NS, METRIC_NAME_STMT_EXEC_COUNT, - METRIC_NAME_TOTAL_RU, METRIC_NAME_EXEC_COUNT, METRIC_NAME_EXEC_DURATION, - SOURCE_TABLE_TIDB_TOPSQL, SOURCE_TABLE_TOPSQL_PLAN_META, SOURCE_TABLE_TOPSQL_SQL_META, SOURCE_TABLE_TOPRU, + LABEL_DATE, LABEL_ENCODED_NORMALIZED_PLAN, LABEL_INSTANCE_KEY, LABEL_KEYSPACE, + LABEL_NORMALIZED_PLAN, LABEL_NORMALIZED_SQL, LABEL_PLAN_DIGEST, LABEL_SOURCE_TABLE, + LABEL_SQL_DIGEST, LABEL_TIMESTAMPS, LABEL_USER, METRIC_NAME_CPU_TIME_MS, + METRIC_NAME_EXEC_COUNT, METRIC_NAME_EXEC_DURATION, METRIC_NAME_NETWORK_IN_BYTES, + METRIC_NAME_NETWORK_OUT_BYTES, METRIC_NAME_STMT_DURATION_COUNT, + METRIC_NAME_STMT_DURATION_SUM_NS, METRIC_NAME_STMT_EXEC_COUNT, METRIC_NAME_TOTAL_RU, + SOURCE_TABLE_TIDB_TOPSQL, SOURCE_TABLE_TOPRU, SOURCE_TABLE_TOPSQL_PLAN_META, + SOURCE_TABLE_TOPSQL_SQL_META, }; use crate::sources::topsql_v2::upstream::parser::UpstreamEventParser; use crate::sources::topsql_v2::upstream::tidb::proto::top_sql_sub_response::RespOneof; use crate::sources::topsql_v2::upstream::tidb::proto::{ PlanMeta, SqlMeta, TopSqlRecord, 
TopSqlRecordItem, TopSqlSubResponse, }; +use chrono::Utc; +use vector::event::Event; +use vector_lib::event::{LogEvent, Value as LogValue}; pub struct TopSqlSubResponseParser; @@ -31,9 +32,7 @@ impl UpstreamEventParser for TopSqlSubResponseParser { _schema_cache: Arc, ) -> Vec { match response.resp_oneof { - Some(RespOneof::Record(record)) => { - Self::parse_tidb_record(record, instance) - } + Some(RespOneof::Record(record)) => Self::parse_tidb_record(record, instance), Some(RespOneof::SqlMeta(sql_meta)) => Self::parse_tidb_sql_meta(sql_meta), Some(RespOneof::PlanMeta(plan_meta)) => Self::parse_tidb_plan_meta(plan_meta), Some(RespOneof::RuRecord(ru_record)) => Self::parse_top_ru_record(ru_record), @@ -103,14 +102,15 @@ impl UpstreamEventParser for TopSqlSubResponseParser { let mut cpu_values: Vec = v.iter().map(|psd| psd.cpu_time_ms).collect(); cpu_values.select_nth_unstable_by(top_n, |a, b| b.cmp(a)); let cpu_threshold = cpu_values[top_n]; - + // Find top_n threshold for network bytes using partial selection - let mut network_values: Vec = v.iter() + let mut network_values: Vec = v + .iter() .map(|psd| psd.stmt_network_in_bytes + psd.stmt_network_out_bytes) .collect(); network_values.select_nth_unstable_by(top_n, |a, b| b.cmp(a)); let network_threshold = network_values[top_n]; - + // Keep records that meet either threshold let mut kept = Vec::new(); for psd in v.iter() { @@ -132,7 +132,7 @@ impl UpstreamEventParser for TopSqlSubResponseParser { others.stmt_network_out_bytes += psd.stmt_network_out_bytes; } } - + *v = kept; } @@ -215,16 +215,19 @@ impl UpstreamEventParser for TopSqlSubResponseParser { } impl TopSqlSubResponseParser { - fn parse_tidb_record( - record: TopSqlRecord, - instance: String, - ) -> Vec { - let mut keyspace_name_str = "".to_string(); - if !record.keyspace_name.is_empty() { - if let Ok(ks) = String::from_utf8(record.keyspace_name.clone()) { - keyspace_name_str = ks; - } + fn decode_keyspace_name(keyspace_name: &[u8]) -> Option { + if 
keyspace_name.is_empty() { + return None; } + + String::from_utf8(keyspace_name.to_vec()) + .ok() + .filter(|value| !value.is_empty()) + } + + fn parse_tidb_record(record: TopSqlRecord, instance: String) -> Vec { + let keyspace_name_str = + Self::decode_keyspace_name(&record.keyspace_name).unwrap_or_default(); let mut events = vec![]; let instance_key = format!("topsql_tidb_{}", instance); let mut date = String::new(); @@ -237,8 +240,8 @@ impl TopSqlSubResponseParser { log.insert(LABEL_TIMESTAMPS, LogValue::from(item.timestamp_sec)); if date.is_empty() { date = chrono::DateTime::from_timestamp(item.timestamp_sec as i64, 0) - .map(|dt| dt.format("%Y-%m-%d").to_string()) - .unwrap_or_else(|| "1970-01-01".to_string()); + .map(|dt| dt.format("%Y-%m-%d").to_string()) + .unwrap_or_else(|| "1970-01-01".to_string()); } log.insert(LABEL_DATE, LogValue::from(date.clone())); log.insert(LABEL_INSTANCE_KEY, instance_key.clone()); @@ -282,12 +285,16 @@ impl TopSqlSubResponseParser { fn parse_tidb_sql_meta(sql_meta: SqlMeta) -> Vec { let mut events = vec![]; let sql_digest = hex::encode_upper(sql_meta.sql_digest); + let keyspace_name = Self::decode_keyspace_name(&sql_meta.keyspace_name); let mut event = Event::Log(LogEvent::default()); let log = event.as_mut_log(); log.insert(LABEL_SOURCE_TABLE, SOURCE_TABLE_TOPSQL_SQL_META); log.insert(LABEL_SQL_DIGEST, sql_digest); log.insert(LABEL_NORMALIZED_SQL, sql_meta.normalized_sql); + if let Some(keyspace_name) = keyspace_name { + log.insert(LABEL_KEYSPACE, keyspace_name); + } let now = Utc::now(); log.insert(LABEL_TIMESTAMPS, LogValue::from(now.timestamp())); let date_str = now.format("%Y-%m-%d").to_string(); @@ -299,8 +306,8 @@ impl TopSqlSubResponseParser { fn parse_tidb_plan_meta(plan_meta: PlanMeta) -> Vec { let mut events = vec![]; let plan_digest = hex::encode_upper(plan_meta.plan_digest); - let encoded_normalized_plan = - hex::encode_upper(plan_meta.encoded_normalized_plan); + let keyspace_name = 
Self::decode_keyspace_name(&plan_meta.keyspace_name); + let encoded_normalized_plan = hex::encode_upper(plan_meta.encoded_normalized_plan); let mut event = Event::Log(LogEvent::default()); let log = event.as_mut_log(); @@ -308,10 +315,10 @@ impl TopSqlSubResponseParser { log.insert(LABEL_SOURCE_TABLE, SOURCE_TABLE_TOPSQL_PLAN_META); log.insert(LABEL_PLAN_DIGEST, plan_digest); log.insert(LABEL_NORMALIZED_PLAN, plan_meta.normalized_plan); - log.insert( - LABEL_ENCODED_NORMALIZED_PLAN, - encoded_normalized_plan, - ); + log.insert(LABEL_ENCODED_NORMALIZED_PLAN, encoded_normalized_plan); + if let Some(keyspace_name) = keyspace_name { + log.insert(LABEL_KEYSPACE, keyspace_name); + } let now = Utc::now(); log.insert(LABEL_TIMESTAMPS, LogValue::from(now.timestamp())); let date_str = now.format("%Y-%m-%d").to_string(); @@ -320,16 +327,14 @@ impl TopSqlSubResponseParser { events } - fn parse_top_ru_record(record: crate::sources::topsql_v2::upstream::tidb::proto::TopRuRecord) -> Vec { + fn parse_top_ru_record( + record: crate::sources::topsql_v2::upstream::tidb::proto::TopRuRecord, + ) -> Vec { let mut events = vec![]; let mut date = String::new(); - let mut keyspace_name_str = "".to_string(); - if !record.keyspace_name.is_empty() { - if let Ok(ks) = String::from_utf8(record.keyspace_name.clone()) { - keyspace_name_str = ks; - } - } + let keyspace_name_str = + Self::decode_keyspace_name(&record.keyspace_name).unwrap_or_default(); for item in record.items { let mut event = Event::Log(LogEvent::default()); @@ -359,7 +364,10 @@ impl TopSqlSubResponseParser { ); log.insert(METRIC_NAME_TOTAL_RU, LogValue::from(item.total_ru)); log.insert(METRIC_NAME_EXEC_COUNT, LogValue::from(item.exec_count)); - log.insert(METRIC_NAME_EXEC_DURATION, LogValue::from(item.exec_duration)); + log.insert( + METRIC_NAME_EXEC_DURATION, + LogValue::from(item.exec_duration), + ); events.push(event.into_log()); } @@ -370,7 +378,9 @@ impl TopSqlSubResponseParser { #[cfg(test)] mod tests { use super::*; - use 
crate::sources::topsql_v2::upstream::tidb::proto::{TopSqlRecordItem, TopRuRecord, TopRuRecordItem}; + use crate::sources::topsql_v2::upstream::tidb::proto::{ + PlanMeta, SqlMeta, TopRuRecord, TopRuRecordItem, TopSqlRecordItem, + }; const MOCK_RECORDS: &'static str = include_str!("testdata/mock-records.json"); @@ -431,7 +441,7 @@ mod tests { let plan_digest = vec![4, 5, 6]; let timestamp = 1000u64; let test_keyspace_name = b"test_keyspace_2".to_vec(); - + // Create 5 records with same timestamp let items: Vec = (0..5) .map(|i| TopSqlRecordItem { @@ -445,7 +455,7 @@ mod tests { stmt_network_out_bytes: 200 + i as u64, }) .collect(); - + responses.push(TopSqlSubResponse { resp_oneof: Some(RespOneof::Record(TopSqlRecord { sql_digest: sql_digest.clone(), @@ -454,21 +464,24 @@ mod tests { keyspace_name: test_keyspace_name.clone(), })), }); - + // top_n = 10, which is greater than 5, so all should be kept let result = TopSqlSubResponseParser::keep_top_n(responses.clone(), 10); - + // Should have same number of responses (all kept) assert_eq!(result.len(), 1); if let Some(RespOneof::Record(record)) = &result[0].resp_oneof { assert_eq!(record.items.len(), 5); assert_eq!(record.sql_digest, sql_digest); assert_eq!(record.plan_digest, plan_digest); - assert_eq!(record.keyspace_name, test_keyspace_name, "keyspace_name should be preserved"); + assert_eq!( + record.keyspace_name, test_keyspace_name, + "keyspace_name should be preserved" + ); } else { panic!("Expected Record"); } - + // top_n = 5, which equals 5, so all should be kept let result2 = TopSqlSubResponseParser::keep_top_n(responses, 5); assert_eq!(result2.len(), 1); @@ -476,7 +489,10 @@ mod tests { assert_eq!(record.items.len(), 5); assert_eq!(record.sql_digest, sql_digest); assert_eq!(record.plan_digest, plan_digest); - assert_eq!(record.keyspace_name, test_keyspace_name, "keyspace_name should be preserved"); + assert_eq!( + record.keyspace_name, test_keyspace_name, + "keyspace_name should be preserved" + ); } else { 
panic!("Expected Record"); } @@ -491,7 +507,7 @@ mod tests { let plan_digest = vec![4, 5, 6]; let timestamp = 1000u64; let test_keyspace_name = b"test_keyspace_3".to_vec(); - + // Create 10 records with same cpu_time_ms and same network bytes let items: Vec = (0..10) .map(|_| TopSqlRecordItem { @@ -501,11 +517,11 @@ mod tests { stmt_kv_exec_count: BTreeMap::new(), stmt_duration_sum_ns: 1000, stmt_duration_count: 1, - stmt_network_in_bytes: 100, // All same + stmt_network_in_bytes: 100, // All same stmt_network_out_bytes: 200, // All same, total = 300 }) .collect(); - + responses.push(TopSqlSubResponse { resp_oneof: Some(RespOneof::Record(TopSqlRecord { sql_digest: sql_digest.clone(), @@ -514,44 +530,45 @@ mod tests { keyspace_name: test_keyspace_name.clone(), })), }); - + // top_n = 5, all values are same // New logic: threshold equals the value (top_n-th largest, which is the same value), // so no records satisfy > threshold condition, all should go to others let result = TopSqlSubResponseParser::keep_top_n(responses, 5); - + // Verify all records go to others let mut total_cpu_kept = 0u32; let mut total_network_kept = 0u64; let mut kept_count = 0; let mut total_cpu_others = 0u32; let mut total_network_others = 0u64; - + for response in result { if let Some(RespOneof::Record(record)) = response.resp_oneof { // Verify keyspace_name is preserved assert_eq!( - record.keyspace_name, - test_keyspace_name, + record.keyspace_name, test_keyspace_name, "keyspace_name should be preserved in all records" ); - + if record.sql_digest.is_empty() { // This is others for item in record.items { total_cpu_others += item.cpu_time_ms; - total_network_others += item.stmt_network_in_bytes + item.stmt_network_out_bytes; + total_network_others += + item.stmt_network_in_bytes + item.stmt_network_out_bytes; } } else { kept_count += record.items.len(); for item in record.items { total_cpu_kept += item.cpu_time_ms; - total_network_kept += item.stmt_network_in_bytes + 
item.stmt_network_out_bytes; + total_network_kept += + item.stmt_network_in_bytes + item.stmt_network_out_bytes; } } } } - + // New behavior: all records go to others (none satisfy > threshold when all values are same) assert_eq!(kept_count, 0); assert_eq!(total_cpu_kept, 0); @@ -568,7 +585,7 @@ mod tests { let mut responses = vec![]; let top_n = 3; let test_keyspace_name = b"test_keyspace_timestamps".to_vec(); - + // Timestamp 1000: 8 records mixing high CPU/low network, low CPU/high network, both high, both low // Expected: Keep records that meet either CPU threshold (>20) OR network threshold (>40) // Top 3 CPU: 100, 90, 80 -> threshold = 20 (4th largest) @@ -576,16 +593,16 @@ mod tests { let timestamp1 = 1000u64; let test_cases_ts1 = vec![ // (sql_id, plan_id, cpu_time_ms, network_in_bytes, network_out_bytes, reason) - (1, 1, 100, 10, 10), // High CPU (100), low network (20) -> keep (CPU > 20) - (2, 2, 90, 10, 10), // High CPU (90), low network (20) -> keep (CPU > 20) - (3, 3, 80, 10, 10), // High CPU (80), low network (20) -> keep (CPU > 20) + (1, 1, 100, 10, 10), // High CPU (100), low network (20) -> keep (CPU > 20) + (2, 2, 90, 10, 10), // High CPU (90), low network (20) -> keep (CPU > 20) + (3, 3, 80, 10, 10), // High CPU (80), low network (20) -> keep (CPU > 20) (4, 4, 10, 200, 200), // Low CPU (10), high network (400) -> keep (network > 40) (5, 5, 10, 175, 175), // Low CPU (10), high network (350) -> keep (network > 40) (6, 6, 10, 150, 150), // Low CPU (10), high network (300) -> keep (network > 40) - (7, 7, 20, 20, 20), // Low CPU (20), low network (40) -> evict (CPU == 20, network == 40) - (8, 8, 15, 15, 15), // Low CPU (15), low network (30) -> evict + (7, 7, 20, 20, 20), // Low CPU (20), low network (40) -> evict (CPU == 20, network == 40) + (8, 8, 15, 15, 15), // Low CPU (15), low network (30) -> evict ]; - + for (sql_id, plan_id, cpu_time, net_in, net_out) in test_cases_ts1.iter() { let sql_digest = vec![*sql_id]; let plan_digest = vec![*plan_id]; 
@@ -607,22 +624,22 @@ mod tests { })), }); } - + // Timestamp 2000: 7 records mixing different combinations // Expected: Keep records that meet either CPU threshold (>20) OR network threshold (>60) // Top 3 CPU: 100, 90, 70 -> threshold = 20 (4th largest) // Top 3 Network: 380, 360, 140 -> threshold = 60 (4th largest) let timestamp2 = 2000u64; let test_cases_ts2 = vec![ - (9, 9, 100, 10, 10), // High CPU (100), low network (20) -> keep (CPU > 20) - (10, 10, 90, 10, 10), // High CPU (90), low network (20) -> keep (CPU > 20) - (11, 11, 70, 10, 10), // High CPU (70), low network (20) -> keep (CPU > 20) + (9, 9, 100, 10, 10), // High CPU (100), low network (20) -> keep (CPU > 20) + (10, 10, 90, 10, 10), // High CPU (90), low network (20) -> keep (CPU > 20) + (11, 11, 70, 10, 10), // High CPU (70), low network (20) -> keep (CPU > 20) (12, 12, 10, 190, 190), // Low CPU (10), high network (380) -> keep (network > 60) (13, 13, 10, 180, 180), // Low CPU (10), high network (360) -> keep (network > 60) (14, 14, 10, 70, 70), // Low CPU (10), high network (140) -> keep (network > 60) - (15, 15, 20, 30, 30), // Low CPU (20), low network (60) -> evict (CPU == 20, network == 60) + (15, 15, 20, 30, 30), // Low CPU (20), low network (60) -> evict (CPU == 20, network == 60) ]; - + for (sql_id, plan_id, cpu_time, net_in, net_out) in test_cases_ts2.iter() { let sql_digest = vec![*sql_id]; let plan_digest = vec![*plan_id]; @@ -644,14 +661,11 @@ mod tests { })), }); } - + // Timestamp 3000: 2 records (both should be kept since 2 <= top_n=3) let timestamp3 = 3000u64; - let test_cases_ts3 = vec![ - (16, 16, 50, 50, 50), - (17, 17, 40, 40, 40), - ]; - + let test_cases_ts3 = vec![(16, 16, 50, 50, 50), (17, 17, 40, 40, 40)]; + for (sql_id, plan_id, cpu_time, net_in, net_out) in test_cases_ts3.iter() { let sql_digest = vec![*sql_id]; let plan_digest = vec![*plan_id]; @@ -673,26 +687,25 @@ mod tests { })), }); } - + let result = TopSqlSubResponseParser::keep_top_n(responses, top_n); - + // 
Group results by timestamp let mut results_by_timestamp: BTreeMap> = BTreeMap::new(); // timestamp -> [(sql_id, cpu, network), ...] let mut others_by_timestamp: BTreeMap = BTreeMap::new(); // timestamp -> (cpu, network) - + for response in result { if let Some(RespOneof::Record(record)) = response.resp_oneof { // Verify keyspace_name is preserved assert_eq!( - record.keyspace_name, - test_keyspace_name, + record.keyspace_name, test_keyspace_name, "keyspace_name should be preserved in all records" ); - + for item in record.items { let timestamp = item.timestamp_sec; let network_total = item.stmt_network_in_bytes + item.stmt_network_out_bytes; - + if record.sql_digest.is_empty() { // This is others let entry = others_by_timestamp.entry(timestamp).or_insert((0, 0)); @@ -709,7 +722,7 @@ mod tests { } } } - + // Verify timestamp 1000: should keep 6 records (3 high CPU + 3 high network), evict 2 // CPU threshold = 20 (4th largest), keep records with CPU > 20 // Network threshold = 40 (4th largest), keep records with network > 40 @@ -717,19 +730,47 @@ mod tests { .get(×tamp1) .map(|records| records.iter().map(|r| r.0).collect()) .unwrap_or_default(); - assert_eq!(ts1_kept.len(), 6, "Timestamp 1000 should keep 6 records (3 high CPU + 3 high network)"); + assert_eq!( + ts1_kept.len(), + 6, + "Timestamp 1000 should keep 6 records (3 high CPU + 3 high network)" + ); // High CPU records (1, 2, 3) should be kept - assert!(ts1_kept.contains(&1), "Timestamp 1000 should keep sql_id 1 (high CPU)"); - assert!(ts1_kept.contains(&2), "Timestamp 1000 should keep sql_id 2 (high CPU)"); - assert!(ts1_kept.contains(&3), "Timestamp 1000 should keep sql_id 3 (high CPU)"); + assert!( + ts1_kept.contains(&1), + "Timestamp 1000 should keep sql_id 1 (high CPU)" + ); + assert!( + ts1_kept.contains(&2), + "Timestamp 1000 should keep sql_id 2 (high CPU)" + ); + assert!( + ts1_kept.contains(&3), + "Timestamp 1000 should keep sql_id 3 (high CPU)" + ); // High network records (4, 5, 6) should be kept 
- assert!(ts1_kept.contains(&4), "Timestamp 1000 should keep sql_id 4 (high network)"); - assert!(ts1_kept.contains(&5), "Timestamp 1000 should keep sql_id 5 (high network)"); - assert!(ts1_kept.contains(&6), "Timestamp 1000 should keep sql_id 6 (high network)"); + assert!( + ts1_kept.contains(&4), + "Timestamp 1000 should keep sql_id 4 (high network)" + ); + assert!( + ts1_kept.contains(&5), + "Timestamp 1000 should keep sql_id 5 (high network)" + ); + assert!( + ts1_kept.contains(&6), + "Timestamp 1000 should keep sql_id 6 (high network)" + ); // Low both records (7, 8) should be evicted - assert!(!ts1_kept.contains(&7), "Timestamp 1000 should NOT keep sql_id 7 (low both)"); - assert!(!ts1_kept.contains(&8), "Timestamp 1000 should NOT keep sql_id 8 (low both)"); - + assert!( + !ts1_kept.contains(&7), + "Timestamp 1000 should NOT keep sql_id 7 (low both)" + ); + assert!( + !ts1_kept.contains(&8), + "Timestamp 1000 should NOT keep sql_id 8 (low both)" + ); + // Verify kept records meet at least one threshold if let Some(records) = results_by_timestamp.get(×tamp1) { let cpu_threshold = 20u32; @@ -744,14 +785,22 @@ mod tests { ); } } - + if let Some((others_cpu, others_network)) = others_by_timestamp.get(×tamp1) { - assert_eq!(*others_cpu, 20 + 15, "Timestamp 1000 others CPU should be 35 (20+15)"); - assert_eq!(*others_network, 40 + 30, "Timestamp 1000 others network should be 70 (40+30)"); + assert_eq!( + *others_cpu, + 20 + 15, + "Timestamp 1000 others CPU should be 35 (20+15)" + ); + assert_eq!( + *others_network, + 40 + 30, + "Timestamp 1000 others network should be 70 (40+30)" + ); } else { panic!("Timestamp 1000 should have others records"); } - + // Verify timestamp 2000: should keep 6 records (3 high CPU + 3 high network), evict 1 // CPU threshold = 20 (4th largest), keep records with CPU > 20 // Network threshold = 60 (4th largest), keep records with network > 60 @@ -759,18 +808,43 @@ mod tests { .get(×tamp2) .map(|records| records.iter().map(|r| 
r.0).collect()) .unwrap_or_default(); - assert_eq!(ts2_kept.len(), 6, "Timestamp 2000 should keep 6 records (3 high CPU + 3 high network)"); + assert_eq!( + ts2_kept.len(), + 6, + "Timestamp 2000 should keep 6 records (3 high CPU + 3 high network)" + ); // High CPU records (9, 10, 11) should be kept - assert!(ts2_kept.contains(&9), "Timestamp 2000 should keep sql_id 9 (high CPU)"); - assert!(ts2_kept.contains(&10), "Timestamp 2000 should keep sql_id 10 (high CPU)"); - assert!(ts2_kept.contains(&11), "Timestamp 2000 should keep sql_id 11 (high CPU)"); + assert!( + ts2_kept.contains(&9), + "Timestamp 2000 should keep sql_id 9 (high CPU)" + ); + assert!( + ts2_kept.contains(&10), + "Timestamp 2000 should keep sql_id 10 (high CPU)" + ); + assert!( + ts2_kept.contains(&11), + "Timestamp 2000 should keep sql_id 11 (high CPU)" + ); // High network records (12, 13, 14) should be kept - assert!(ts2_kept.contains(&12), "Timestamp 2000 should keep sql_id 12 (high network)"); - assert!(ts2_kept.contains(&13), "Timestamp 2000 should keep sql_id 13 (high network)"); - assert!(ts2_kept.contains(&14), "Timestamp 2000 should keep sql_id 14 (high network)"); + assert!( + ts2_kept.contains(&12), + "Timestamp 2000 should keep sql_id 12 (high network)" + ); + assert!( + ts2_kept.contains(&13), + "Timestamp 2000 should keep sql_id 13 (high network)" + ); + assert!( + ts2_kept.contains(&14), + "Timestamp 2000 should keep sql_id 14 (high network)" + ); // Low both record (15) should be evicted - assert!(!ts2_kept.contains(&15), "Timestamp 2000 should NOT keep sql_id 15 (low both)"); - + assert!( + !ts2_kept.contains(&15), + "Timestamp 2000 should NOT keep sql_id 15 (low both)" + ); + // Verify kept records meet at least one threshold if let Some(records) = results_by_timestamp.get(×tamp2) { let cpu_threshold = 20u32; @@ -785,28 +859,47 @@ mod tests { ); } } - + if let Some((others_cpu, others_network)) = others_by_timestamp.get(×tamp2) { assert_eq!(*others_cpu, 20, "Timestamp 2000 others 
CPU should be 20"); - assert_eq!(*others_network, 60, "Timestamp 2000 others network should be 60 (30+30)"); + assert_eq!( + *others_network, 60, + "Timestamp 2000 others network should be 60 (30+30)" + ); } else { panic!("Timestamp 2000 should have others records"); } - + // Verify timestamp 3000: should keep all 2 records (2 <= top_n=3) let ts3_kept: Vec = results_by_timestamp .get(&timestamp3) .map(|records| records.iter().map(|r| r.0).collect()) .unwrap_or_default(); - assert_eq!(ts3_kept.len(), 2, "Timestamp 3000 should keep all 2 records"); - assert!(ts3_kept.contains(&16), "Timestamp 3000 should keep sql_id 16"); - assert!(ts3_kept.contains(&17), "Timestamp 3000 should keep sql_id 17"); - + assert_eq!( + ts3_kept.len(), + 2, + "Timestamp 3000 should keep all 2 records" + ); + assert!( + ts3_kept.contains(&16), + "Timestamp 3000 should keep sql_id 16" + ); + assert!( + ts3_kept.contains(&17), + "Timestamp 3000 should keep sql_id 17" + ); + // Timestamp 3000 should not have others since all records are kept - assert!(!others_by_timestamp.contains_key(&timestamp3), "Timestamp 3000 should not have others"); - + assert!( + !others_by_timestamp.contains_key(&timestamp3), + "Timestamp 3000 should not have others" + ); + // Verify total counts - let total_kept: usize = results_by_timestamp.values().map(|records| records.len()).sum(); + let total_kept: usize = results_by_timestamp + .values() + .map(|records| records.len()) + .sum(); assert_eq!(total_kept, 14, "Total kept records should be 14 (6+6+2)"); } @@ -877,7 +970,10 @@ mod tests { assert_eq!(sum_old.stmt_duration_count, sum_new.stmt_duration_count); assert_eq!(sum_old.stmt_duration_sum_ns, sum_new.stmt_duration_sum_ns); assert_eq!(sum_old.stmt_network_in_bytes, sum_new.stmt_network_in_bytes); - assert_eq!(sum_old.stmt_network_out_bytes, sum_new.stmt_network_out_bytes); + assert_eq!( + sum_old.stmt_network_out_bytes, + sum_new.stmt_network_out_bytes + ); } #[test] @@ -909,25 +1005,118 @@ mod tests { // Check first event let 
event1 = &events[0]; let log1 = event1; - assert_eq!(log1.get(LABEL_SOURCE_TABLE), Some(&LogValue::from(SOURCE_TABLE_TOPRU))); - assert_eq!(log1.get(LABEL_TIMESTAMPS), Some(&LogValue::from(1709646900))); + assert_eq!( + log1.get(LABEL_SOURCE_TABLE), + Some(&LogValue::from(SOURCE_TABLE_TOPRU)) + ); + assert_eq!( + log1.get(LABEL_TIMESTAMPS), + Some(&LogValue::from(1709646900)) + ); assert_eq!(log1.get(LABEL_DATE), Some(&LogValue::from("2024-03-05"))); - assert_eq!(log1.get(LABEL_KEYSPACE), Some(&LogValue::from("test_keyspace"))); + assert_eq!( + log1.get(LABEL_KEYSPACE), + Some(&LogValue::from("test_keyspace")) + ); assert_eq!(log1.get(LABEL_USER), Some(&LogValue::from("test_user"))); - assert_eq!(log1.get(LABEL_SQL_DIGEST), Some(&LogValue::from("73716C5F6469676573745F313233"))); - assert_eq!(log1.get(LABEL_PLAN_DIGEST), Some(&LogValue::from("706C616E5F6469676573745F343536"))); + assert_eq!( + log1.get(LABEL_SQL_DIGEST), + Some(&LogValue::from("73716C5F6469676573745F313233")) + ); + assert_eq!( + log1.get(LABEL_PLAN_DIGEST), + Some(&LogValue::from("706C616E5F6469676573745F343536")) + ); assert_eq!(log1.get(METRIC_NAME_TOTAL_RU), Some(&LogValue::from(100.5))); assert_eq!(log1.get(METRIC_NAME_EXEC_COUNT), Some(&LogValue::from(10))); - assert_eq!(log1.get(METRIC_NAME_EXEC_DURATION), Some(&LogValue::from(50000000))); + assert_eq!( + log1.get(METRIC_NAME_EXEC_DURATION), + Some(&LogValue::from(50000000)) + ); // Check second event let event2 = &events[1]; let log2 = event2; - assert_eq!(log2.get(LABEL_SOURCE_TABLE), Some(&LogValue::from(SOURCE_TABLE_TOPRU))); - assert_eq!(log2.get(LABEL_TIMESTAMPS), Some(&LogValue::from(1709646960))); + assert_eq!( + log2.get(LABEL_SOURCE_TABLE), + Some(&LogValue::from(SOURCE_TABLE_TOPRU)) + ); + assert_eq!( + log2.get(LABEL_TIMESTAMPS), + Some(&LogValue::from(1709646960)) + ); assert_eq!(log2.get(LABEL_DATE), Some(&LogValue::from("2024-03-05"))); assert_eq!(log2.get(METRIC_NAME_TOTAL_RU), Some(&LogValue::from(200.0))); 
assert_eq!(log2.get(METRIC_NAME_EXEC_COUNT), Some(&LogValue::from(20))); - assert_eq!(log2.get(METRIC_NAME_EXEC_DURATION), Some(&LogValue::from(100000000))); + assert_eq!( + log2.get(METRIC_NAME_EXEC_DURATION), + Some(&LogValue::from(100000000)) + ); + } + + #[test] + fn test_parse_tidb_sql_meta_includes_keyspace() { + let sql_meta = SqlMeta { + sql_digest: b"sql_digest".to_vec(), + normalized_sql: "select 1".to_string(), + is_internal_sql: false, + keyspace_name: b"test_keyspace".to_vec(), + }; + + let events = TopSqlSubResponseParser::parse_tidb_sql_meta(sql_meta); + assert_eq!(events.len(), 1); + + let log = &events[0]; + assert_eq!( + log.get(LABEL_SOURCE_TABLE), + Some(&LogValue::from(SOURCE_TABLE_TOPSQL_SQL_META)) + ); + assert_eq!( + log.get(LABEL_SQL_DIGEST), + Some(&LogValue::from("73716C5F646967657374")) + ); + assert_eq!( + log.get(LABEL_NORMALIZED_SQL), + Some(&LogValue::from("select 1")) + ); + assert_eq!( + log.get(LABEL_KEYSPACE), + Some(&LogValue::from("test_keyspace")) + ); + } + + #[test] + fn test_parse_tidb_plan_meta_includes_keyspace() { + let plan_meta = PlanMeta { + plan_digest: b"plan_digest".to_vec(), + normalized_plan: "Point_Get".to_string(), + encoded_normalized_plan: "encoded_plan".to_string(), + keyspace_name: b"test_keyspace".to_vec(), + }; + + let events = TopSqlSubResponseParser::parse_tidb_plan_meta(plan_meta); + assert_eq!(events.len(), 1); + + let log = &events[0]; + assert_eq!( + log.get(LABEL_SOURCE_TABLE), + Some(&LogValue::from(SOURCE_TABLE_TOPSQL_PLAN_META)) + ); + assert_eq!( + log.get(LABEL_PLAN_DIGEST), + Some(&LogValue::from("706C616E5F646967657374")) + ); + assert_eq!( + log.get(LABEL_NORMALIZED_PLAN), + Some(&LogValue::from("Point_Get")) + ); + assert_eq!( + log.get(LABEL_ENCODED_NORMALIZED_PLAN), + Some(&LogValue::from("656E636F6465645F706C616E")) + ); + assert_eq!( + log.get(LABEL_KEYSPACE), + Some(&LogValue::from("test_keyspace")) + ); } } From 665ee2eb12078f6b9196691f30157282582f2b1d Mon Sep 17 00:00:00 2001 From: 
zeminzhou Date: Wed, 25 Mar 2026 14:29:17 +0800 Subject: [PATCH 18/26] Validate keyspace routing path templates --- src/common/keyspace_cluster.rs | 91 ++++++++++++++++++++ src/sinks/topsql_data_deltalake/arch.md | 2 +- src/sinks/topsql_data_deltalake/mod.rs | 6 +- src/sinks/topsql_data_deltalake/processor.rs | 49 +++++++++-- src/sinks/topsql_meta_deltalake/arch.md | 2 +- src/sinks/topsql_meta_deltalake/mod.rs | 6 +- src/sinks/topsql_meta_deltalake/processor.rs | 46 ++++++++-- 7 files changed, 187 insertions(+), 15 deletions(-) diff --git a/src/common/keyspace_cluster.rs b/src/common/keyspace_cluster.rs index 62a424d..fd1288c 100644 --- a/src/common/keyspace_cluster.rs +++ b/src/common/keyspace_cluster.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::fs; use std::num::NonZeroUsize; +use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; @@ -27,6 +28,50 @@ pub struct KeyspaceRoute { pub cluster_id: String, } +pub fn path_contains_keyspace_route_segments(path: &str) -> bool { + let mut has_org_segment = false; + let mut has_cluster_segment = false; + for segment in path.split('/') { + if segment.starts_with("org=") { + has_org_segment = true; + } else if segment.starts_with("cluster=") { + has_cluster_segment = true; + } + } + + has_org_segment && has_cluster_segment +} + +pub fn validate_keyspace_route_template(path: &str) -> Result<(), String> { + if path_contains_keyspace_route_segments(path) { + return Ok(()); + } + + Err(format!( + "base_path must contain both `org=` and `cluster=` path segments when enable_keyspace_cluster_mapping is true; expected something like `.../org=xxx/cluster=xxx/...`, got: {}", + path + )) +} + +pub fn replace_keyspace_route_segments(base_path: &PathBuf, route: &KeyspaceRoute) -> PathBuf { + let path = base_path.to_string_lossy(); + let replaced = path + .split('/') + .map(|segment| { + if segment.starts_with("org=") { + format!("org={}", route.org_id) + } else if segment.starts_with("cluster=") { + 
format!("cluster={}", route.cluster_id) + } else { + segment.to_string() + } + }) + .collect::>() + .join("/"); + + PathBuf::from(replaced) +} + #[derive(Clone)] pub struct PdKeyspaceResolver { base_url: String, @@ -282,6 +327,52 @@ mod tests { assert!(!is_not_found_body(r#"{"message":"PD server not found"}"#)); } + #[test] + fn path_contains_keyspace_route_segments_requires_both_segments() { + assert!(path_contains_keyspace_route_segments( + "s3://bucket/deltalake/org=xxx/cluster=xxx/type=topsql" + )); + assert!(path_contains_keyspace_route_segments( + "/tmp/deltalake/org=xxx/cluster=xxx/type=topsql" + )); + assert!(!path_contains_keyspace_route_segments( + "s3://bucket/deltalake/org=xxx/type=topsql" + )); + assert!(!path_contains_keyspace_route_segments( + "/tmp/deltalake/type=topsql" + )); + } + + #[test] + fn replace_keyspace_route_segments_rewrites_template_values() { + let replaced = replace_keyspace_route_segments( + &PathBuf::from("s3://bucket/deltalake/org=xxx/cluster=xxx/type=topsql"), + &KeyspaceRoute { + org_id: "30018".to_string(), + cluster_id: "10155668891296301432".to_string(), + }, + ); + + assert_eq!( + replaced, + PathBuf::from( + "s3://bucket/deltalake/org=30018/cluster=10155668891296301432/type=topsql" + ) + ); + } + + #[test] + fn validate_keyspace_route_template_requires_org_and_cluster_segments() { + assert!( + validate_keyspace_route_template("/tmp/deltalake/org=xxx/cluster=xxx/type=topsql") + .is_ok() + ); + + let error = validate_keyspace_route_template("/tmp/deltalake/type=topsql").unwrap_err(); + assert!(error.contains("org=")); + assert!(error.contains("cluster=")); + } + #[tokio::test] async fn resolve_keyspace_uses_pd_keyspace_api_and_caches_result() { let request_count = Arc::new(AtomicUsize::new(0)); diff --git a/src/sinks/topsql_data_deltalake/arch.md b/src/sinks/topsql_data_deltalake/arch.md index 38d96ca..2a0d10c 100644 --- a/src/sinks/topsql_data_deltalake/arch.md +++ b/src/sinks/topsql_data_deltalake/arch.md @@ -56,7 +56,7 @@ 
pub struct TopSQLDataDeltaLakeConfig { - **SQL Digest Grouping**: Group by SQL digest - **Time Partitioning**: Partition by execution time - **Schema Optimization**: Optimized schema for TopSQL data -- **Keyspace-based Routing**: Optional PD keyspace lookup can prepend `org=/cluster=` path segments before the table layout, which is especially useful for `topru` data written to shared S3 prefixes +- **Keyspace-based Routing**: When `enable_keyspace_cluster_mapping = true`, `base_path` must already contain `org=xxx/cluster=xxx` template segments; the sink resolves keyspace via PD and replaces those template values with the routed `org` / `cluster` - **Component-based Path Layout**: TopSQL data is partitioned by `component=` and `instance=` ## Dependencies diff --git a/src/sinks/topsql_data_deltalake/mod.rs b/src/sinks/topsql_data_deltalake/mod.rs index e18be0b..1bb59d9 100644 --- a/src/sinks/topsql_data_deltalake/mod.rs +++ b/src/sinks/topsql_data_deltalake/mod.rs @@ -27,7 +27,7 @@ mod processor; // Import default functions from common module use crate::common::deltalake_s3; use crate::common::deltalake_writer::{default_batch_size, default_timeout_secs}; -use crate::common::keyspace_cluster::PdKeyspaceResolver; +use crate::common::keyspace_cluster::{validate_keyspace_route_template, PdKeyspaceResolver}; pub const fn default_enable_keyspace_cluster_mapping() -> bool { false @@ -188,6 +188,10 @@ impl DeltaLakeConfig { s3_service: Option<&S3Service>, _cx: SinkContext, ) -> vector::Result { + if self.enable_keyspace_cluster_mapping { + validate_keyspace_route_template(&self.base_path).map_err(vector::Error::from)?; + } + // For OSS with virtual hosted style, we may need to adjust the base_path format // to ensure object_store correctly parses the bucket let base_path = if let Some(_endpoint) = self.region.as_ref().and_then(|r| r.endpoint()) { diff --git a/src/sinks/topsql_data_deltalake/processor.rs b/src/sinks/topsql_data_deltalake/processor.rs index 1ea2d9a..7df7cfe 
100644 --- a/src/sinks/topsql_data_deltalake/processor.rs +++ b/src/sinks/topsql_data_deltalake/processor.rs @@ -10,7 +10,10 @@ use vector_lib::event::{Event, LogEvent}; use vector_lib::sink::StreamSink; use crate::common::deltalake_writer::{DeltaLakeWriter, DeltaTableConfig, WriteConfig}; -use crate::common::keyspace_cluster::{KeyspaceRoute, PdKeyspaceResolver}; +use crate::common::keyspace_cluster::{ + path_contains_keyspace_route_segments, replace_keyspace_route_segments, KeyspaceRoute, + PdKeyspaceResolver, +}; use crate::sources::topsql_v2::upstream::consts::{ LABEL_DATE, LABEL_DB_NAME, LABEL_INSTANCE_KEY, LABEL_KEYSPACE, LABEL_PLAN_DIGEST, LABEL_REGION_ID, LABEL_SOURCE_TABLE, LABEL_SQL_DIGEST, LABEL_TABLE_ID, LABEL_TABLE_NAME, @@ -507,17 +510,19 @@ impl TopSQLDeltaLakeSink { fn build_table_path(&self, table_name: &str, route: Option<&KeyspaceRoute>) -> PathBuf { let (table_type, table_instance) = Self::table_partition_values(table_name); + let mut base_path = self.base_path.clone(); let mut segments = Vec::new(); if let Some(route) = route { - segments.push(format!("org={}", route.org_id)); - segments.push(format!("cluster={}", route.cluster_id)); + if path_contains_keyspace_route_segments(&self.base_path.to_string_lossy()) { + base_path = replace_keyspace_route_segments(&base_path, route); + } } segments.push(format!("component={}", table_type)); segments.push(format!("instance={}", table_instance)); let segment_refs: Vec<&str> = segments.iter().map(|segment| segment.as_str()).collect(); - Self::join_path(&self.base_path, &segment_refs) + Self::join_path(&base_path, &segment_refs) } fn table_partition_values(table_name: &str) -> (&str, &str) { @@ -736,7 +741,9 @@ mod tests { #[test] fn test_build_table_path_with_meta_route_for_s3() { let (sink, _) = TopSQLDeltaLakeSink::new_for_test( - PathBuf::from("s3://o11y-prod-shared-us-west-2-premium/deltalake"), + PathBuf::from( + "s3://o11y-prod-shared-us-west-2-premium/deltalake/org=xxx/cluster=xxx/type=topsql", + 
), vec![], WriteConfig { batch_size: 1, @@ -758,7 +765,37 @@ mod tests { assert_eq!( table_path, PathBuf::from( - "s3://o11y-prod-shared-us-west-2-premium/deltalake/org=1369847559692509642/cluster=10110362358366286743/component=tidb/instance=127.0.0.1:10080" + "s3://o11y-prod-shared-us-west-2-premium/deltalake/org=1369847559692509642/cluster=10110362358366286743/type=topsql/component=tidb/instance=127.0.0.1:10080" + ) + ); + } + + #[test] + fn test_build_table_path_with_meta_route_replaces_template_for_local_path() { + let (sink, _) = TopSQLDeltaLakeSink::new_for_test( + PathBuf::from("/tmp/deltalake/org=xxx/cluster=xxx/type=topsql"), + vec![], + WriteConfig { + batch_size: 1, + timeout_secs: 0, + }, + 180, + None, + None, + ); + + let table_path = sink.build_table_path( + "topsql_tidb_127.0.0.1:10080", + Some(&KeyspaceRoute { + org_id: "30018".to_string(), + cluster_id: "101".to_string(), + }), + ); + + assert_eq!( + table_path, + PathBuf::from( + "/tmp/deltalake/org=30018/cluster=101/type=topsql/component=tidb/instance=127.0.0.1:10080" ) ); } diff --git a/src/sinks/topsql_meta_deltalake/arch.md b/src/sinks/topsql_meta_deltalake/arch.md index f1e5d66..51de133 100644 --- a/src/sinks/topsql_meta_deltalake/arch.md +++ b/src/sinks/topsql_meta_deltalake/arch.md @@ -57,7 +57,7 @@ pub struct TopSQLMetaDeltaLakeConfig { - **Schema Storage**: Store SQL schemas - **Query Plan Storage**: Store query execution plans - **Metadata Versioning**: Track metadata changes over time -- **Keyspace-aware Layout**: When `enable_keyspace_cluster_mapping = true`, metadata is written under `org=/cluster=/component=topsql_{sql_meta|plan_meta}` +- **Keyspace-aware Layout**: When `enable_keyspace_cluster_mapping = true`, `base_path` must already contain `org=xxx/cluster=xxx` template segments; metadata writes replace those template values and keep the remaining layout, such as `.../type=topsql/component=topsql_{sql_meta|plan_meta}` ## Dependencies diff --git 
a/src/sinks/topsql_meta_deltalake/mod.rs b/src/sinks/topsql_meta_deltalake/mod.rs index eb7d2e0..c91dcb9 100644 --- a/src/sinks/topsql_meta_deltalake/mod.rs +++ b/src/sinks/topsql_meta_deltalake/mod.rs @@ -27,7 +27,7 @@ mod processor; // Import default functions from common module use crate::common::deltalake_s3; use crate::common::deltalake_writer::{default_batch_size, default_timeout_secs}; -use crate::common::keyspace_cluster::PdKeyspaceResolver; +use crate::common::keyspace_cluster::{validate_keyspace_route_template, PdKeyspaceResolver}; pub const fn default_enable_keyspace_cluster_mapping() -> bool { false @@ -197,6 +197,10 @@ impl DeltaLakeConfig { s3_service: Option<&S3Service>, _cx: SinkContext, ) -> vector::Result { + if self.enable_keyspace_cluster_mapping { + validate_keyspace_route_template(&self.base_path).map_err(vector::Error::from)?; + } + // For OSS with virtual hosted style, we may need to adjust the base_path format // to ensure object_store correctly parses the bucket let base_path = if let Some(_endpoint) = self.region.as_ref().and_then(|r| r.endpoint()) { diff --git a/src/sinks/topsql_meta_deltalake/processor.rs b/src/sinks/topsql_meta_deltalake/processor.rs index e1d5d9d..fbdb6ec 100644 --- a/src/sinks/topsql_meta_deltalake/processor.rs +++ b/src/sinks/topsql_meta_deltalake/processor.rs @@ -11,6 +11,9 @@ use vector_lib::event::{Event, LogEvent}; use vector_lib::sink::StreamSink; use crate::common::deltalake_writer::{DeltaLakeWriter, DeltaTableConfig, WriteConfig}; +use crate::common::keyspace_cluster::{ + path_contains_keyspace_route_segments, replace_keyspace_route_segments, +}; use crate::common::keyspace_cluster::{KeyspaceRoute, PdKeyspaceResolver}; use crate::sources::topsql_v2::upstream::consts::{ LABEL_DATE, LABEL_ENCODED_NORMALIZED_PLAN, LABEL_KEYSPACE, LABEL_NORMALIZED_PLAN, @@ -488,15 +491,17 @@ impl TopSQLDeltaLakeSink { } fn build_table_path(&self, table_name: &str, route: Option<&KeyspaceRoute>) -> PathBuf { + let mut base_path = 
self.base_path.clone(); let mut segments = Vec::new(); if let Some(route) = route { - segments.push(format!("org={}", route.org_id)); - segments.push(format!("cluster={}", route.cluster_id)); + if path_contains_keyspace_route_segments(&self.base_path.to_string_lossy()) { + base_path = replace_keyspace_route_segments(&base_path, route); + } } segments.push(format!("component={}", table_name)); let segment_refs: Vec<&str> = segments.iter().map(|segment| segment.as_str()).collect(); - Self::join_path(&self.base_path, &segment_refs) + Self::join_path(&base_path, &segment_refs) } fn join_path(base_path: &PathBuf, segments: &[&str]) -> PathBuf { @@ -758,7 +763,9 @@ mod tests { #[test] fn test_build_table_path_with_keyspace_route_for_s3() { let (sink, _) = TopSQLDeltaLakeSink::new_for_test( - PathBuf::from("s3://o11y-prod-shared-us-west-2-premium/deltalake"), + PathBuf::from( + "s3://o11y-prod-shared-us-west-2-premium/deltalake/org=xxx/cluster=xxx/type=topsql", + ), vec![], WriteConfig { batch_size: 1, @@ -781,11 +788,40 @@ mod tests { assert_eq!( table_path, PathBuf::from( - "s3://o11y-prod-shared-us-west-2-premium/deltalake/org=30018/cluster=10155668891296301432/component=topsql_sql_meta" + "s3://o11y-prod-shared-us-west-2-premium/deltalake/org=30018/cluster=10155668891296301432/type=topsql/component=topsql_sql_meta" ) ); } + #[test] + fn test_build_table_path_with_keyspace_route_replaces_template_for_local_path() { + let (sink, _) = TopSQLDeltaLakeSink::new_for_test( + PathBuf::from("/tmp/test/org=xxx/cluster=xxx/type=topsql"), + vec![], + WriteConfig { + batch_size: 1, + timeout_secs: 0, + }, + 180, + None, + 10000, + None, + ); + + let table_path = sink.build_table_path( + SOURCE_TABLE_TOPSQL_PLAN_META, + Some(&KeyspaceRoute { + org_id: "30018".to_string(), + cluster_id: "101".to_string(), + }), + ); + + assert_eq!( + table_path, + PathBuf::from("/tmp/test/org=30018/cluster=101/type=topsql/component=topsql_plan_meta") + ); + } + #[test] fn 
test_build_table_path_without_keyspace_route_preserves_existing_layout() { let (sink, _) = create_test_sink_with_receiver(1); From dabaa594726f0aee4b5799f210aa4bd17ea93dd3 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Thu, 26 Mar 2026 10:44:11 +0800 Subject: [PATCH 19/26] refactor: simplify manager response parsing and address review feedback - Replace recursive JSON extraction with direct serde deserialization for ActiveTiDBAddress, removing ~100 lines of guessing logic - Remove redundant normalize_namespaces wrapper method - Change PdKeyspaceResolver::new to take Option<&TlsConfig> instead of owned Option to avoid unnecessary clones - Add debug_assert on replace_keyspace_route_segments to catch org_id/cluster_id containing path separators - Add TODO for resolve_keyspace concurrent cache miss dedup - Add comment explaining org=/cluster= path segment convention --- src/common/keyspace_cluster.rs | 16 ++- src/common/topology/fetch/tidb_manager.rs | 138 ++-------------------- src/sinks/topsql_data_deltalake/mod.rs | 2 +- src/sinks/topsql_meta_deltalake/mod.rs | 2 +- 4 files changed, 28 insertions(+), 130 deletions(-) diff --git a/src/common/keyspace_cluster.rs b/src/common/keyspace_cluster.rs index fd1288c..49b1571 100644 --- a/src/common/keyspace_cluster.rs +++ b/src/common/keyspace_cluster.rs @@ -28,6 +28,9 @@ pub struct KeyspaceRoute { pub cluster_id: String, } +// Path segments use the exact prefixes `org=` and `cluster=` as a convention. +// These are Hive-style partition keys chosen for the storage path layout; +// callers must follow this convention when constructing base_path templates. 
pub fn path_contains_keyspace_route_segments(path: &str) -> bool { let mut has_org_segment = false; let mut has_cluster_segment = false; @@ -54,6 +57,10 @@ pub fn validate_keyspace_route_template(path: &str) -> Result<(), String> { } pub fn replace_keyspace_route_segments(base_path: &PathBuf, route: &KeyspaceRoute) -> PathBuf { + debug_assert!( + !route.org_id.contains('/') && !route.cluster_id.contains('/'), + "org_id and cluster_id must not contain path separators" + ); let path = base_path.to_string_lossy(); let replaced = path .split('/') @@ -85,9 +92,9 @@ struct PdKeyspaceMetadata { } impl PdKeyspaceResolver { - pub fn new(pd_address: impl Into, pd_tls: Option) -> Result { - let client = build_http_client(pd_tls.as_ref())?; - Ok(Self::new_with_client(pd_address, pd_tls.as_ref(), client)) + pub fn new(pd_address: impl Into, pd_tls: Option<&TlsConfig>) -> Result { + let client = build_http_client(pd_tls)?; + Ok(Self::new_with_client(pd_address, pd_tls, client)) } pub fn new_with_client( @@ -118,6 +125,9 @@ impl PdKeyspaceResolver { } } + // TODO: concurrent requests for the same keyspace can all miss the cache and + // issue duplicate HTTP calls. Consider an in-flight dedup mechanism if PD + // pressure becomes a concern. 
pub async fn resolve_keyspace( &self, keyspace_name: &str, diff --git a/src/common/topology/fetch/tidb_manager.rs b/src/common/topology/fetch/tidb_manager.rs index 60e2817..44aa6b1 100644 --- a/src/common/topology/fetch/tidb_manager.rs +++ b/src/common/topology/fetch/tidb_manager.rs @@ -1,7 +1,7 @@ use std::{collections::HashSet, env}; use hyper::body::HttpBody; -use serde_json::{Map, Value}; +use serde::Deserialize; use snafu::{ResultExt, Snafu}; use vector::http::HttpClient; @@ -12,7 +12,6 @@ use crate::common::topology::{Component, InstanceType}; const GET_ACTIVE_TIDB_PATH: &str = "/api/tidb/get_active_tidb"; const DEFAULT_TIDB_PRIMARY_PORT: u16 = 4000; const DEFAULT_TIDB_STATUS_PORT: u16 = 10080; -const MAX_RESPONSE_DEPTH: usize = 8; const MAX_MANAGER_RESPONSE_BYTES: usize = 8 * 1024 * 1024; const VECTOR_STS_REPLICA_COUNT_ENV: &str = "VECTOR_STS_REPLICA_COUNT"; const VECTOR_STS_ID_ENV: &str = "VECTOR_STS_ID"; @@ -39,12 +38,16 @@ pub enum FetchError { ParseTiDBHost { source: utils::ParseError }, } -#[derive(Debug, Clone, Eq, PartialEq)] +#[derive(Debug, Clone, Eq, PartialEq, Deserialize)] struct ActiveTiDBAddress { host: String, + #[serde(default)] port: Option, + #[serde(default)] status_port: Option, + #[serde(default)] hostname: Option, + #[serde(default)] keyspace_name: Option, } @@ -146,10 +149,6 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { )) } - fn normalize_namespaces(namespaces: Option<&str>) -> Option { - normalize_namespace_list(namespaces) - } - fn build_active_tidb_endpoint_url(manager_server_address: &str, namespaces: &str) -> String { let mut endpoint = if manager_server_address.ends_with(GET_ACTIVE_TIDB_PATH) { manager_server_address.to_owned() @@ -178,8 +177,8 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { fn parse_active_tidb_addresses_response( bytes: &[u8], ) -> Result, FetchError> { - let value = serde_json::from_slice::(bytes).context(ActiveTiDBJsonFromStrSnafu)?; - let addresses = Self::extract_active_tidb_addresses(&value, 0)?; + let 
addresses: Vec = + serde_json::from_slice(bytes).context(ActiveTiDBJsonFromStrSnafu)?; if addresses.is_empty() { return Err(FetchError::InvalidManagerResponse { @@ -190,98 +189,6 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { Ok(addresses) } - fn extract_active_tidb_addresses( - value: &Value, - depth: usize, - ) -> Result, FetchError> { - if depth > MAX_RESPONSE_DEPTH { - return Err(FetchError::InvalidManagerResponse { - message: "response nesting is too deep".to_owned(), - }); - } - - match value { - Value::String(host) => Ok(vec![ActiveTiDBAddress { - host: host.clone(), - port: None, - status_port: None, - hostname: None, - keyspace_name: None, - }]), - Value::Array(items) => { - let mut addresses = Vec::new(); - for item in items { - addresses.extend(Self::extract_active_tidb_addresses(item, depth + 1)?); - } - Ok(addresses) - } - Value::Object(obj) => { - if let Some(address) = Self::extract_active_tidb_address_from_object(obj) { - return Ok(vec![address]); - } - - for key in [ - "data", - "result", - "active_tidb_addresses", - "tidb_addresses", - "active_tidbs", - "tidbs", - "addresses", - "instances", - "items", - "nodes", - "list", - ] { - if let Some(next_value) = obj.get(key) { - let addresses = Self::extract_active_tidb_addresses(next_value, depth + 1)?; - if !addresses.is_empty() { - return Ok(addresses); - } - } - } - - Ok(Vec::new()) - } - _ => Ok(Vec::new()), - } - } - - fn extract_active_tidb_address_from_object( - obj: &Map, - ) -> Option { - let host = Self::extract_string_field( - obj, - &["host", "address", "tidb_address", "active_tidb_address"], - )?; - let port = Self::extract_u16_field(obj, &["port", "primary_port"]); - let status_port = Self::extract_u16_field(obj, &["status_port", "secondary_port"]); - let hostname = Self::extract_string_field(obj, &["hostname", "pod_name", "instance_name"]); - let keyspace_name = - Self::extract_string_field(obj, &["keyspace_name", "keyspaceName", "keyspace"]); - - Some(ActiveTiDBAddress { - host, - port, - 
status_port, - hostname, - keyspace_name, - }) - } - - fn extract_string_field(obj: &Map, keys: &[&str]) -> Option { - keys.iter() - .find_map(|key| obj.get(*key).and_then(Value::as_str).map(str::to_owned)) - } - - fn extract_u16_field(obj: &Map, keys: &[&str]) -> Option { - keys.iter().find_map(|key| { - obj.get(*key) - .and_then(Value::as_u64) - .and_then(|raw| u16::try_from(raw).ok()) - }) - } - fn filter_active_tidb_addresses( active_tidb_addresses: Vec, shard_config: Option, @@ -472,25 +379,12 @@ mod tests { ); } - #[test] - fn parse_response_supports_keyspace_aliases() { - let bytes = br#"[ - {"host":"10.0.0.1","keyspace":"tenant-a"}, - {"host":"10.0.0.2","keyspaceName":"tenant-b"} - ]"#; - let addresses = - TiDBManagerTopologyFetcher::parse_active_tidb_addresses_response(bytes).unwrap(); - - assert_eq!(addresses[0].keyspace_name.as_deref(), Some("tenant-a")); - assert_eq!(addresses[1].keyspace_name.as_deref(), Some("tenant-b")); - } - #[test] fn parse_response_invalid_format() { let bytes = br#"{"code":0,"message":"ok"}"#; let err = TiDBManagerTopologyFetcher::parse_active_tidb_addresses_response(bytes) .expect_err("expected invalid manager response"); - assert!(matches!(err, FetchError::InvalidManagerResponse { .. })); + assert!(matches!(err, FetchError::ActiveTiDBJsonFromStr { .. 
})); } #[test] @@ -544,20 +438,14 @@ mod tests { #[test] fn normalize_namespaces_none_or_empty() { - assert_eq!(TiDBManagerTopologyFetcher::normalize_namespaces(None), None); - assert_eq!( - TiDBManagerTopologyFetcher::normalize_namespaces(Some("")), - None - ); - assert_eq!( - TiDBManagerTopologyFetcher::normalize_namespaces(Some(" , ")), - None - ); + assert_eq!(normalize_namespace_list(None), None); + assert_eq!(normalize_namespace_list(Some("")), None); + assert_eq!(normalize_namespace_list(Some(" , ")), None); } #[test] fn normalize_namespaces_trim_and_filter() { - let normalized = TiDBManagerTopologyFetcher::normalize_namespaces(Some( + let normalized = normalize_namespace_list(Some( " super-vip-tidb-pool, canary-super-vip-tidb-pool , ", )); assert_eq!( diff --git a/src/sinks/topsql_data_deltalake/mod.rs b/src/sinks/topsql_data_deltalake/mod.rs index 1bb59d9..1ac1265 100644 --- a/src/sinks/topsql_data_deltalake/mod.rs +++ b/src/sinks/topsql_data_deltalake/mod.rs @@ -248,7 +248,7 @@ impl DeltaLakeConfig { ) })?; Some( - PdKeyspaceResolver::new(pd_address, self.pd_tls.clone()).map_err(|error| { + PdKeyspaceResolver::new(pd_address, self.pd_tls.as_ref()).map_err(|error| { vector::Error::from(format!( "failed to build PD keyspace resolver from pd_address: {}", error diff --git a/src/sinks/topsql_meta_deltalake/mod.rs b/src/sinks/topsql_meta_deltalake/mod.rs index c91dcb9..fc7ea86 100644 --- a/src/sinks/topsql_meta_deltalake/mod.rs +++ b/src/sinks/topsql_meta_deltalake/mod.rs @@ -257,7 +257,7 @@ impl DeltaLakeConfig { ) })?; Some( - PdKeyspaceResolver::new(pd_address, self.pd_tls.clone()).map_err(|error| { + PdKeyspaceResolver::new(pd_address, self.pd_tls.as_ref()).map_err(|error| { vector::Error::from(format!( "failed to build PD keyspace resolver from pd_address: {}", error From da53898e5f4b57ed980f0fc83d39c5f9e3805c3a Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Thu, 26 Mar 2026 11:12:32 +0800 Subject: [PATCH 20/26] fix: treat empty manager active TiDB 
response as valid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Empty array from manager is a legitimate response (no active TiDB instances). Previously it was treated as an error while filter-to-empty after keyspace sharding was treated as Ok — inconsistent behavior. Now both cases flow through get_up_tidbs which logs and returns Ok, making the behavior uniform. Also removes the now-unused InvalidManagerResponse error variant. --- src/common/topology/fetch/tidb_manager.rs | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/common/topology/fetch/tidb_manager.rs b/src/common/topology/fetch/tidb_manager.rs index 44aa6b1..4a727fa 100644 --- a/src/common/topology/fetch/tidb_manager.rs +++ b/src/common/topology/fetch/tidb_manager.rs @@ -30,8 +30,6 @@ pub enum FetchError { ActiveTiDBResponseTooLarge { limit_bytes: usize }, #[snafu(display("Failed to parse active tidb response JSON text: {}", source))] ActiveTiDBJsonFromStr { source: serde_json::Error }, - #[snafu(display("Invalid manager server response: {}", message))] - InvalidManagerResponse { message: String }, #[snafu(display("Invalid manager keyspace shard config: {}", message))] InvalidShardConfig { message: String }, #[snafu(display("Failed to parse tidb host from manager response: {}", source))] @@ -177,16 +175,7 @@ impl<'a> TiDBManagerTopologyFetcher<'a> { fn parse_active_tidb_addresses_response( bytes: &[u8], ) -> Result, FetchError> { - let addresses: Vec = - serde_json::from_slice(bytes).context(ActiveTiDBJsonFromStrSnafu)?; - - if addresses.is_empty() { - return Err(FetchError::InvalidManagerResponse { - message: "no active tidb addresses found".to_owned(), - }); - } - - Ok(addresses) + serde_json::from_slice(bytes).context(ActiveTiDBJsonFromStrSnafu) } fn filter_active_tidb_addresses( From 44d6dcb539f0f044a8c600c69b1bdd6392f11e25 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Thu, 26 Mar 2026 11:38:29 +0800 Subject: [PATCH 21/26] 
fix: defer dedup LRU commit until flush succeeds to prevent data loss on retry process_events wrote dedup keys to LRU before flush_buffer persisted data to Delta Lake. If flush failed (e.g. PD keyspace lookup error), the retry path would find all keys already in LRU and silently skip them, causing permanent meta data loss. Fix: stage dedup keys in a pending_dedup_keys buffer during process_events (read-only LRU check). Commit to LRU only after flush_buffer succeeds. On failure, process_events_loop clears pending state so the cloned retry snapshot is re-processed cleanly. --- src/sinks/topsql_meta_deltalake/processor.rs | 74 ++++++++++++++++---- 1 file changed, 60 insertions(+), 14 deletions(-) diff --git a/src/sinks/topsql_meta_deltalake/processor.rs b/src/sinks/topsql_meta_deltalake/processor.rs index fbdb6ec..6942af8 100644 --- a/src/sinks/topsql_meta_deltalake/processor.rs +++ b/src/sinks/topsql_meta_deltalake/processor.rs @@ -129,6 +129,9 @@ pub struct TopSQLDeltaLakeSink { seen_keys_plan_meta: Arc>>, // Buffer for events to be flushed new_event_buffer: Arc>>, + // Dedup keys pending commit — written to LRU only after a successful flush. + // Each entry is (source_table, dedup_key) parallel to new_event_buffer. 
+ pending_dedup_keys: Arc>>, // Last flush time last_flush_time: Arc>, } @@ -167,6 +170,7 @@ impl TopSQLDeltaLakeSink { std::num::NonZeroUsize::new(meta_cache_capacity).unwrap(), ))), // LRU cache with configurable capacity new_event_buffer: Arc::new(Mutex::new(Vec::new())), + pending_dedup_keys: Arc::new(Mutex::new(Vec::new())), last_flush_time: Arc::new(Mutex::new(Instant::now())), }; let sink_clone = sink.clone(); @@ -213,6 +217,7 @@ impl TopSQLDeltaLakeSink { std::num::NonZeroUsize::new(meta_cache_capacity).unwrap(), ))), // LRU cache with configurable capacity new_event_buffer: Arc::new(Mutex::new(Vec::new())), + pending_dedup_keys: Arc::new(Mutex::new(Vec::new())), last_flush_time: Arc::new(Mutex::new(Instant::now())), }; @@ -234,6 +239,11 @@ impl TopSQLDeltaLakeSink { Ok(()) => break, Err(error) => { error!("Failed to process events: {}", error); + // Clear stale state so the retry snapshot is processed + // from scratch — pending keys were never committed to + // LRU, and the buffer was already drained by flush. + self.pending_dedup_keys.lock().await.clear(); + self.new_event_buffer.lock().await.clear(); let Some(events) = retry_snapshot else { if retry_on_failure { error!( @@ -327,11 +337,25 @@ impl TopSQLDeltaLakeSink { // Group events by writer target so each org/cluster route gets its own table. let mut table_events: HashMap> = HashMap::new(); let mut resolved_routes: HashMap> = HashMap::new(); - for event in buffer.drain(..) { + let drained_events: Vec = buffer.drain(..).collect(); + drop(buffer); + + for event in drained_events { if let Event::Log(log_event) = event { if let Some(writer_key) = self .resolve_writer_key(&log_event, &mut resolved_routes) - .await? + .await + .map_err(|e| { + // Put events back into buffer so retry can re-process them. + // pending_dedup_keys are intentionally kept — they have not + // been committed to LRU yet, so the retry path in + // process_events_loop will re-drain them together. 
+ // + // We cannot restore events here because we partially moved + // them; the retry relies on the cloned snapshot in + // process_events_loop instead. + e + })? { table_events .entry(writer_key) @@ -366,6 +390,21 @@ impl TopSQLDeltaLakeSink { } } + // Flush succeeded — commit pending dedup keys to LRU so future + // duplicates are correctly suppressed. + { + let mut seen_sql = self.seen_keys_sql_meta.lock().await; + let mut seen_plan = self.seen_keys_plan_meta.lock().await; + let mut pending = self.pending_dedup_keys.lock().await; + for (table_name, key) in pending.drain(..) { + match table_name.as_str() { + SOURCE_TABLE_TOPSQL_SQL_META => { seen_sql.put(key, ()); } + SOURCE_TABLE_TOPSQL_PLAN_META => { seen_plan.put(key, ()); } + _ => {} + } + } + } + // Update last flush time *self.last_flush_time.lock().await = Instant::now(); @@ -381,9 +420,10 @@ impl TopSQLDeltaLakeSink { return Ok(()); } - let mut seen_keys_sql_meta = self.seen_keys_sql_meta.lock().await; - let mut seen_keys_plan_meta = self.seen_keys_plan_meta.lock().await; + let seen_keys_sql_meta = self.seen_keys_sql_meta.lock().await; + let seen_keys_plan_meta = self.seen_keys_plan_meta.lock().await; let mut buffer = self.new_event_buffer.lock().await; + let mut pending_keys = self.pending_dedup_keys.lock().await; let last_flush = *self.last_flush_time.lock().await; let current_time = Instant::now(); let flush_interval = Duration::from_secs(self.max_delay_secs); @@ -396,21 +436,26 @@ impl TopSQLDeltaLakeSink { if let Some((table_name, key)) = self.extract_event_key(&log_event) { // Select the appropriate LRU cache based on table_name (source_table) let seen_keys = match table_name.as_str() { - SOURCE_TABLE_TOPSQL_SQL_META => &mut *seen_keys_sql_meta, - SOURCE_TABLE_TOPSQL_PLAN_META => &mut *seen_keys_plan_meta, + SOURCE_TABLE_TOPSQL_SQL_META => &*seen_keys_sql_meta, + SOURCE_TABLE_TOPSQL_PLAN_META => &*seen_keys_plan_meta, _ => continue, // Skip unknown event types }; - // Check if key is already in 
LRU cache - if seen_keys.get(&key).is_some() { - // Update key in LRU cache (touch it) - get() already does this + // Check if key is already committed in LRU cache + if seen_keys.peek(&key).is_some() { continue; - } else { - // Insert key to LRU cache - seen_keys.put(key.clone(), ()); - // Put event in buffer - buffer.push(Event::Log(log_event)); } + + // Check if key is already pending in this unflushed batch + if pending_keys.iter().any(|(t, k)| t == &table_name && k == &key) { + continue; + } + + // Stage the key — it will be committed to LRU only after + // flush_buffer succeeds, so a retry can re-process the + // same events without them being silently dropped. + pending_keys.push((table_name, key)); + buffer.push(Event::Log(log_event)); } // If key cannot be extracted, skip the event } @@ -427,6 +472,7 @@ impl TopSQLDeltaLakeSink { if buffer_full || time_reached { // Release buffer lock before flushing + drop(pending_keys); drop(buffer); // Flush buffer to deltalake From 4d3e974c9bf7b77059aa641f7d983170dfba51a3 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Thu, 26 Mar 2026 15:47:54 +0800 Subject: [PATCH 22/26] fix: commit meta dedup keys only after successful writes --- src/sinks/topsql_meta_deltalake/processor.rs | 143 +++++++++++++++---- 1 file changed, 112 insertions(+), 31 deletions(-) diff --git a/src/sinks/topsql_meta_deltalake/processor.rs b/src/sinks/topsql_meta_deltalake/processor.rs index 6942af8..173b81f 100644 --- a/src/sinks/topsql_meta_deltalake/processor.rs +++ b/src/sinks/topsql_meta_deltalake/processor.rs @@ -330,41 +330,54 @@ impl TopSQLDeltaLakeSink { /// Flush buffer to Delta Lake async fn flush_buffer(&self) -> Result<(), Box> { let mut buffer = self.new_event_buffer.lock().await; + let mut pending_keys = self.pending_dedup_keys.lock().await; if buffer.is_empty() { + pending_keys.clear(); return Ok(()); } + let drained_events: Vec = buffer.drain(..).collect(); + let drained_pending_keys: Vec<(String, String)> = 
pending_keys.drain(..).collect(); + drop(buffer); + drop(pending_keys); + + if drained_events.len() != drained_pending_keys.len() { + return Err(format!( + "mismatched buffered events ({}) and pending dedup keys ({})", + drained_events.len(), + drained_pending_keys.len() + ) + .into()); + } + // Group events by writer target so each org/cluster route gets its own table. let mut table_events: HashMap> = HashMap::new(); + let mut table_dedup_keys: HashMap> = HashMap::new(); let mut resolved_routes: HashMap> = HashMap::new(); - let drained_events: Vec = buffer.drain(..).collect(); - drop(buffer); - for event in drained_events { + for (event, dedup_key) in drained_events + .into_iter() + .zip(drained_pending_keys.into_iter()) + { if let Event::Log(log_event) = event { if let Some(writer_key) = self .resolve_writer_key(&log_event, &mut resolved_routes) - .await - .map_err(|e| { - // Put events back into buffer so retry can re-process them. - // pending_dedup_keys are intentionally kept — they have not - // been committed to LRU yet, so the retry path in - // process_events_loop will re-drain them together. - // - // We cannot restore events here because we partially moved - // them; the retry relies on the cloned snapshot in - // process_events_loop instead. - e - })? + .await? { table_events - .entry(writer_key) + .entry(writer_key.clone()) .or_insert_with(Vec::new) .push(Event::Log(log_event)); + table_dedup_keys + .entry(writer_key) + .or_insert_with(Vec::new) + .push(dedup_key); } } } + let mut committed_dedup_keys = Vec::new(); + // Write table's events for (writer_key, mut events) in table_events { self.add_schema_info(&writer_key.table_name, &mut events); @@ -387,23 +400,13 @@ impl TopSQLDeltaLakeSink { e ); } + } else if let Some(keys) = table_dedup_keys.remove(&writer_key) { + committed_dedup_keys.extend(keys); } } - // Flush succeeded — commit pending dedup keys to LRU so future - // duplicates are correctly suppressed. 
- { - let mut seen_sql = self.seen_keys_sql_meta.lock().await; - let mut seen_plan = self.seen_keys_plan_meta.lock().await; - let mut pending = self.pending_dedup_keys.lock().await; - for (table_name, key) in pending.drain(..) { - match table_name.as_str() { - SOURCE_TABLE_TOPSQL_SQL_META => { seen_sql.put(key, ()); } - SOURCE_TABLE_TOPSQL_PLAN_META => { seen_plan.put(key, ()); } - _ => {} - } - } - } + // Commit dedup keys only for writes that actually succeeded. + self.commit_dedup_keys(committed_dedup_keys).await; // Update last flush time *self.last_flush_time.lock().await = Instant::now(); @@ -447,7 +450,10 @@ impl TopSQLDeltaLakeSink { } // Check if key is already pending in this unflushed batch - if pending_keys.iter().any(|(t, k)| t == &table_name && k == &key) { + if pending_keys + .iter() + .any(|(t, k)| t == &table_name && k == &key) + { continue; } @@ -626,6 +632,26 @@ impl TopSQLDeltaLakeSink { Ok(()) } + + async fn commit_dedup_keys(&self, dedup_keys: Vec<(String, String)>) { + if dedup_keys.is_empty() { + return; + } + + let mut seen_sql = self.seen_keys_sql_meta.lock().await; + let mut seen_plan = self.seen_keys_plan_meta.lock().await; + for (table_name, key) in dedup_keys { + match table_name.as_str() { + SOURCE_TABLE_TOPSQL_SQL_META => { + seen_sql.put(key, ()); + } + SOURCE_TABLE_TOPSQL_PLAN_META => { + seen_plan.put(key, ()); + } + _ => {} + } + } + } } fn route_resolution_retry_delay(retry_count: usize) -> Duration { @@ -730,6 +756,10 @@ impl StreamSink for TopSQLDeltaLakeSink { mod tests { use super::*; use futures::stream; + use hyper::service::{make_service_fn, service_fn}; + use hyper::{Body, Request, Response, Server}; + use std::convert::Infallible; + use std::net::TcpListener; use vector_lib::event::{LogEvent, Value as LogValue}; fn create_test_event(timestamp: i64) -> Event { @@ -806,6 +836,57 @@ mod tests { ); } + #[tokio::test] + async fn test_missing_route_does_not_commit_dedup_key() { + let listener = 
TcpListener::bind("127.0.0.1:0").unwrap(); + let address = listener.local_addr().unwrap(); + let server = + Server::from_tcp(listener) + .unwrap() + .serve(make_service_fn(move |_| async move { + Ok::<_, Infallible>(service_fn(move |_request: Request| async move { + Ok::<_, Infallible>(Response::new(Body::from( + r#"{"config":{"tenant_id":"30018"}}"#, + ))) + })) + })); + let server_handle = tokio::spawn(server); + + let client = reqwest::Client::builder().no_proxy().build().unwrap(); + let resolver = + PdKeyspaceResolver::new_with_client(format!("http://{}", address), None, client); + + let (sink, _) = TopSQLDeltaLakeSink::new_for_test( + PathBuf::from("/tmp/test/org=xxx/cluster=xxx/type=topsql"), + vec![], + WriteConfig { + batch_size: 1, + timeout_secs: 0, + }, + 0, + None, + 10000, + Some(resolver), + ); + + let log_event = create_meta_event( + SOURCE_TABLE_TOPSQL_SQL_META, + LABEL_SQL_DIGEST, + "SQL_DIGEST", + Some("ks-missing"), + ); + sink.process_events(vec![vec![Event::Log(log_event)]]) + .await + .unwrap(); + + assert_eq!(sink.seen_keys_sql_meta.lock().await.len(), 0); + assert_eq!(sink.seen_keys_plan_meta.lock().await.len(), 0); + assert!(sink.pending_dedup_keys.lock().await.is_empty()); + assert!(sink.new_event_buffer.lock().await.is_empty()); + + server_handle.abort(); + } + #[test] fn test_build_table_path_with_keyspace_route_for_s3() { let (sink, _) = TopSQLDeltaLakeSink::new_for_test( From 70c71e951f2e4d5fb715eef878ad7891990d26f1 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Mon, 30 Mar 2026 22:51:30 +0800 Subject: [PATCH 23/26] refactor: extract route_resolution_retry_delay to common keyspace_cluster module Deduplicate identical retry delay function and constants from both topsql_data_deltalake and topsql_meta_deltalake processors into common/keyspace_cluster.rs. Move the associated test as well. 
--- src/common/keyspace_cluster.rs | 22 ++++++++++++++++ src/sinks/topsql_data_deltalake/processor.rs | 27 +++----------------- src/sinks/topsql_meta_deltalake/processor.rs | 15 ++--------- 3 files changed, 27 insertions(+), 37 deletions(-) diff --git a/src/common/keyspace_cluster.rs b/src/common/keyspace_cluster.rs index 49b1571..98b9c1d 100644 --- a/src/common/keyspace_cluster.rs +++ b/src/common/keyspace_cluster.rs @@ -19,6 +19,20 @@ const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); const CONNECT_TIMEOUT: Duration = Duration::from_secs(3); const DEFAULT_KEYSPACE_ROUTE_CACHE_CAPACITY: usize = 10_000; +const ROUTE_RESOLUTION_BASE_DELAY: Duration = Duration::from_secs(5); +const ROUTE_RESOLUTION_MAX_DELAY: Duration = Duration::from_secs(60); +pub const MAX_ROUTE_RESOLUTION_RETRIES: usize = 5; + +/// Exponential backoff delay for keyspace route resolution retries. +pub fn route_resolution_retry_delay(retry_count: usize) -> Duration { + let multiplier = 1u64 << retry_count.saturating_sub(1).min(6); + let delay_secs = ROUTE_RESOLUTION_BASE_DELAY + .as_secs() + .saturating_mul(multiplier) + .min(ROUTE_RESOLUTION_MAX_DELAY.as_secs()); + Duration::from_secs(delay_secs) +} + const ORG_ID_KEYS: &[&str] = &["serverless_tenant_id"]; const CLUSTER_ID_KEYS: &[&str] = &["serverless_cluster_id"]; @@ -521,4 +535,12 @@ mod tests { server_handle.abort(); } + + #[test] + fn route_resolution_retry_delay_caps_at_maximum() { + assert_eq!(route_resolution_retry_delay(1), Duration::from_secs(5)); + assert_eq!(route_resolution_retry_delay(2), Duration::from_secs(10)); + assert_eq!(route_resolution_retry_delay(5), Duration::from_secs(60)); + assert_eq!(route_resolution_retry_delay(8), Duration::from_secs(60)); + } } diff --git a/src/sinks/topsql_data_deltalake/processor.rs b/src/sinks/topsql_data_deltalake/processor.rs index 6b00d4a..b3004bc 100644 --- a/src/sinks/topsql_data_deltalake/processor.rs +++ b/src/sinks/topsql_data_deltalake/processor.rs @@ -1,7 +1,6 @@ use 
std::collections::HashMap; use std::path::PathBuf; use std::sync::Arc; -use std::time::Duration; use futures::{stream::BoxStream, StreamExt}; use tokio::sync::mpsc; @@ -11,8 +10,9 @@ use vector_lib::sink::StreamSink; use crate::common::deltalake_writer::{DeltaLakeWriter, DeltaTableConfig, WriteConfig}; use crate::common::keyspace_cluster::{ - path_contains_keyspace_route_segments, replace_keyspace_route_segments, KeyspaceRoute, - PdKeyspaceResolver, + path_contains_keyspace_route_segments, replace_keyspace_route_segments, + route_resolution_retry_delay, KeyspaceRoute, PdKeyspaceResolver, + MAX_ROUTE_RESOLUTION_RETRIES, }; use crate::sources::topsql_v2::upstream::consts::{ LABEL_DATE, LABEL_DB_NAME, LABEL_INSTANCE_KEY, LABEL_KEYSPACE, LABEL_PLAN_DIGEST, @@ -251,10 +251,6 @@ lazy_static! { }; } -const ROUTE_RESOLUTION_RETRY_DELAY: Duration = Duration::from_secs(5); -const MAX_ROUTE_RESOLUTION_RETRIES: usize = 5; -const MAX_ROUTE_RESOLUTION_RETRY_DELAY: Duration = Duration::from_secs(60); - /// Delta Lake sink processor #[derive(Clone)] pub struct TopSQLDeltaLakeSink { @@ -604,15 +600,6 @@ impl TopSQLDeltaLakeSink { } } -fn route_resolution_retry_delay(retry_count: usize) -> Duration { - let multiplier = 1u64 << retry_count.saturating_sub(1).min(6); - let delay_secs = ROUTE_RESOLUTION_RETRY_DELAY - .as_secs() - .saturating_mul(multiplier) - .min(MAX_ROUTE_RESOLUTION_RETRY_DELAY.as_secs()); - Duration::from_secs(delay_secs) -} - #[async_trait::async_trait] impl StreamSink for TopSQLDeltaLakeSink { async fn run(self: Box, input: BoxStream<'_, Event>) -> Result<(), ()> { @@ -732,14 +719,6 @@ mod tests { ) } - #[test] - fn test_route_resolution_retry_delay_caps_at_maximum() { - assert_eq!(route_resolution_retry_delay(1), Duration::from_secs(5)); - assert_eq!(route_resolution_retry_delay(2), Duration::from_secs(10)); - assert_eq!(route_resolution_retry_delay(5), Duration::from_secs(60)); - assert_eq!(route_resolution_retry_delay(8), Duration::from_secs(60)); - } - #[test] 
fn test_build_table_path_with_meta_route_for_s3() { let (sink, _) = TopSQLDeltaLakeSink::new_for_test( diff --git a/src/sinks/topsql_meta_deltalake/processor.rs b/src/sinks/topsql_meta_deltalake/processor.rs index f87a7a8..dff9526 100644 --- a/src/sinks/topsql_meta_deltalake/processor.rs +++ b/src/sinks/topsql_meta_deltalake/processor.rs @@ -13,8 +13,9 @@ use vector_lib::sink::StreamSink; use crate::common::deltalake_writer::{DeltaLakeWriter, DeltaTableConfig, WriteConfig}; use crate::common::keyspace_cluster::{ path_contains_keyspace_route_segments, replace_keyspace_route_segments, + route_resolution_retry_delay, KeyspaceRoute, PdKeyspaceResolver, + MAX_ROUTE_RESOLUTION_RETRIES, }; -use crate::common::keyspace_cluster::{KeyspaceRoute, PdKeyspaceResolver}; use crate::sources::topsql_v2::upstream::consts::{ LABEL_DATE, LABEL_ENCODED_NORMALIZED_PLAN, LABEL_KEYSPACE, LABEL_NORMALIZED_PLAN, LABEL_NORMALIZED_SQL, LABEL_PLAN_DIGEST, LABEL_SOURCE_TABLE, LABEL_SQL_DIGEST, @@ -108,9 +109,6 @@ lazy_static! 
{ /// When buffer size exceeds this value, events will be flushed const EVENT_BUFFER_MAX_SIZE: usize = 1000; -const ROUTE_RESOLUTION_RETRY_DELAY: Duration = Duration::from_secs(5); -const MAX_ROUTE_RESOLUTION_RETRIES: usize = 5; -const MAX_ROUTE_RESOLUTION_RETRY_DELAY: Duration = Duration::from_secs(60); /// Delta Lake sink processor #[derive(Clone)] @@ -656,15 +654,6 @@ impl TopSQLDeltaLakeSink { } } -fn route_resolution_retry_delay(retry_count: usize) -> Duration { - let multiplier = 1u64 << retry_count.saturating_sub(1).min(6); - let delay_secs = ROUTE_RESOLUTION_RETRY_DELAY - .as_secs() - .saturating_mul(multiplier) - .min(MAX_ROUTE_RESOLUTION_RETRY_DELAY.as_secs()); - Duration::from_secs(delay_secs) -} - #[async_trait::async_trait] impl StreamSink for TopSQLDeltaLakeSink { async fn run(self: Box, input: BoxStream<'_, Event>) -> Result<(), ()> { From 36106611ddc349f2fdd173d14e48e028f085bdf6 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Wed, 8 Apr 2026 11:40:01 +0800 Subject: [PATCH 24/26] revert unused changes Signed-off-by: zeminzhou --- src/sinks/topsql_data_deltalake/mod.rs | 9 +++++---- src/sinks/topsql_meta_deltalake/mod.rs | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/sinks/topsql_data_deltalake/mod.rs b/src/sinks/topsql_data_deltalake/mod.rs index cbd2fd9..0e257ee 100644 --- a/src/sinks/topsql_data_deltalake/mod.rs +++ b/src/sinks/topsql_data_deltalake/mod.rs @@ -136,16 +136,17 @@ impl GenerateConfig for DeltaLakeConfig { impl SinkConfig for DeltaLakeConfig { async fn build(&self, cx: SinkContext) -> vector::Result<(VectorSink, Healthcheck)> { info!( - "DEBUG: Building Delta Lake sink with bucket: {:?}", - self.bucket + "Building Delta Lake sink with bucket: {:?}, base_path: {}", + self.bucket, self.base_path ); + let is_cloud_path = self.base_path.starts_with("s3://") || self.base_path.starts_with("abfss://") || self.base_path.starts_with("gs://"); - // Create S3 service if bucket is configured + // Create S3 service 
if bucket is configured (S3/OSS only) let s3_service = if self.bucket.is_some() { - info!("DEBUG: Bucket configured, creating S3 service"); + info!("Bucket configured, creating S3 service"); match self.create_service(&cx.proxy).await { Ok(service) => { info!("S3 service created successfully"); diff --git a/src/sinks/topsql_meta_deltalake/mod.rs b/src/sinks/topsql_meta_deltalake/mod.rs index 3ffa2d1..5641c54 100644 --- a/src/sinks/topsql_meta_deltalake/mod.rs +++ b/src/sinks/topsql_meta_deltalake/mod.rs @@ -145,16 +145,17 @@ impl GenerateConfig for DeltaLakeConfig { impl SinkConfig for DeltaLakeConfig { async fn build(&self, cx: SinkContext) -> vector::Result<(VectorSink, Healthcheck)> { info!( - "DEBUG: Building Delta Lake sink with bucket: {:?}", - self.bucket + "Building Delta Lake sink with bucket: {:?}, base_path: {}", + self.bucket, self.base_path ); + let is_cloud_path = self.base_path.starts_with("s3://") || self.base_path.starts_with("abfss://") || self.base_path.starts_with("gs://"); - // Create S3 service if bucket is configured + // Create S3 service if bucket is configured (S3/OSS only) let s3_service = if self.bucket.is_some() { - info!("DEBUG: Bucket configured, creating S3 service"); + info!("Bucket configured, creating S3 service"); match self.create_service(&cx.proxy).await { Ok(service) => { info!("S3 service created successfully"); From b65bea7fae4d6420d36f32cc8423c7a243efc7f6 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Wed, 8 Apr 2026 12:23:15 +0800 Subject: [PATCH 25/26] fix: deduplicate concurrent PD keyspace lookups with per-keyspace locking Replace the single cache-miss-then-fetch pattern with double-check locking using per-keyspace Mutex guards. Concurrent resolve_keyspace calls for the same keyspace now serialize behind a shared lock so only the first caller issues the HTTP request; subsequent callers find the result in cache after acquiring the lock. 
Also extracts the HTTP fetch logic into fetch_keyspace_from_pd for clarity and removes the TODO comment. --- src/common/keyspace_cluster.rs | 65 ++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/src/common/keyspace_cluster.rs b/src/common/keyspace_cluster.rs index 98b9c1d..69d24e1 100644 --- a/src/common/keyspace_cluster.rs +++ b/src/common/keyspace_cluster.rs @@ -98,6 +98,8 @@ pub struct PdKeyspaceResolver { base_url: String, client: Client, cache: Arc>>, + /// Per-keyspace locks to deduplicate concurrent HTTP requests for the same keyspace. + keyspace_locks: Arc>>>>, } #[derive(Debug, Deserialize)] @@ -136,12 +138,10 @@ impl PdKeyspaceResolver { cache: Arc::new(Mutex::new(LruCache::new( NonZeroUsize::new(cache_capacity.max(1)).unwrap(), ))), + keyspace_locks: Arc::new(Mutex::new(HashMap::new())), } } - // TODO: concurrent requests for the same keyspace can all miss the cache and - // issue duplicate HTTP calls. Consider an in-flight dedup mechanism if PD - // pressure becomes a concern. pub async fn resolve_keyspace( &self, keyspace_name: &str, @@ -150,12 +150,50 @@ impl PdKeyspaceResolver { return Ok(None); } - let mut cache = self.cache.lock().await; - if let Some(cached) = cache.get(keyspace_name).cloned() { - return Ok(Some(cached)); + // Fast path: read lock on cache (via short-lived Mutex guard). + { + let mut cache = self.cache.lock().await; + if let Some(cached) = cache.get(keyspace_name).cloned() { + return Ok(Some(cached)); + } + } + + // Acquire a per-keyspace lock so only one request hits PD for the same keyspace. + let ks_lock = { + let mut locks = self.keyspace_locks.lock().await; + locks + .entry(keyspace_name.to_string()) + .or_default() + .clone() + }; + let _guard = ks_lock.lock().await; + + // Double-check: another request may have populated the cache while we waited. 
+ { + let mut cache = self.cache.lock().await; + if let Some(cached) = cache.get(keyspace_name).cloned() { + return Ok(Some(cached)); + } + } + + let route = self.fetch_keyspace_from_pd(keyspace_name).await?; + + if let Some(route) = route.clone() { + self.cache + .lock() + .await + .put(keyspace_name.to_string(), route); } - drop(cache); + // Intentionally do not cache misses or transient failures so a later retry can recover + // once PD metadata becomes visible. + Ok(route) + } + + async fn fetch_keyspace_from_pd( + &self, + keyspace_name: &str, + ) -> Result, BoxError> { let encoded_keyspace = byte_serialize(keyspace_name.as_bytes()).collect::(); let response = self .client @@ -183,18 +221,7 @@ impl PdKeyspaceResolver { } let metadata: PdKeyspaceMetadata = response.json().await?; - let route = metadata.config.as_ref().and_then(extract_route_from_config); - - if let Some(route) = route.clone() { - self.cache - .lock() - .await - .put(keyspace_name.to_string(), route); - } - // Intentionally do not cache misses or transient failures so a later retry can recover - // once PD metadata becomes visible. - - Ok(route) + Ok(metadata.config.as_ref().and_then(extract_route_from_config)) } } From a232fa957eb9a5a9b9c030ac59915a8e7692eee7 Mon Sep 17 00:00:00 2001 From: zeminzhou Date: Wed, 8 Apr 2026 17:16:11 +0800 Subject: [PATCH 26/26] fix: clean up keyspace_locks on all exit paths to prevent unbounded growth Wrap the post-lock logic in an async block so the per-keyspace lock cleanup runs on every exit path: double-check cache hit, fetch success, and fetch error. Previously cleanup only ran on the straight-line success path, leaving map entries permanently for early-return callers. 
--- src/common/keyspace_cluster.rs | 49 +++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/src/common/keyspace_cluster.rs b/src/common/keyspace_cluster.rs index 69d24e1..9d2e568 100644 --- a/src/common/keyspace_cluster.rs +++ b/src/common/keyspace_cluster.rs @@ -168,26 +168,45 @@ impl PdKeyspaceResolver { }; let _guard = ks_lock.lock().await; - // Double-check: another request may have populated the cache while we waited. - { - let mut cache = self.cache.lock().await; - if let Some(cached) = cache.get(keyspace_name).cloned() { - return Ok(Some(cached)); + let result: Result, BoxError> = async { + // Double-check: another request may have populated the cache while we waited. + { + let mut cache = self.cache.lock().await; + if let Some(cached) = cache.get(keyspace_name).cloned() { + return Ok(Some(cached)); + } } - } - let route = self.fetch_keyspace_from_pd(keyspace_name).await?; + let route = self.fetch_keyspace_from_pd(keyspace_name).await?; + + if let Some(route) = route.clone() { + self.cache + .lock() + .await + .put(keyspace_name.to_string(), route); + } + // Intentionally do not cache misses or transient failures so a later retry can + // recover once PD metadata becomes visible. + + Ok(route) + } + .await; - if let Some(route) = route.clone() { - self.cache - .lock() - .await - .put(keyspace_name.to_string(), route); + // Clean up the per-keyspace lock if no other task is waiting on it. + // Runs on every exit path: cache hit, fetch success, and fetch error. + drop(_guard); + { + let mut locks = self.keyspace_locks.lock().await; + if let Some(lock) = locks.get(keyspace_name) { + // The HashMap holds one Arc and we cloned one into `ks_lock` (still alive). + // If strong_count == 2, no other task is queued, safe to remove. 
+ if Arc::strong_count(lock) <= 2 { + locks.remove(keyspace_name); + } + } } - // Intentionally do not cache misses or transient failures so a later retry can recover - // once PD metadata becomes visible. - Ok(route) + result } async fn fetch_keyspace_from_pd(